Index: projects/fuse2/include/stdlib.h =================================================================== --- projects/fuse2/include/stdlib.h (revision 350434) +++ projects/fuse2/include/stdlib.h (revision 350435) @@ -1,347 +1,348 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)stdlib.h 8.5 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _STDLIB_H_ #define _STDLIB_H_ #include #include #include __NULLABILITY_PRAGMA_PUSH #if __BSD_VISIBLE #ifndef _RUNE_T_DECLARED typedef __rune_t rune_t; #define _RUNE_T_DECLARED #endif #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #ifndef __cplusplus #ifndef _WCHAR_T_DECLARED typedef ___wchar_t wchar_t; #define _WCHAR_T_DECLARED #endif #endif typedef struct { int quot; /* quotient */ int rem; /* remainder */ } div_t; typedef struct { long quot; long rem; } ldiv_t; #define EXIT_FAILURE 1 #define EXIT_SUCCESS 0 #define RAND_MAX 0x7ffffffd __BEGIN_DECLS #ifdef _XLOCALE_H_ #include #endif extern int __mb_cur_max; extern int ___mb_cur_max(void); #define MB_CUR_MAX ((size_t)___mb_cur_max()) _Noreturn void abort(void); int abs(int) __pure2; int atexit(void (* _Nonnull)(void)); double atof(const char *); int atoi(const char *); long atol(const char *); void *bsearch(const void *, const void *, size_t, size_t, int (*)(const void * _Nonnull, const void *)); void *calloc(size_t, size_t) __malloc_like __result_use_check __alloc_size2(1, 2); div_t div(int, int) __pure2; _Noreturn void exit(int); void free(void *); char *getenv(const char *); long labs(long) __pure2; ldiv_t ldiv(long, long) __pure2; void *malloc(size_t) __malloc_like __result_use_check __alloc_size(1); int mblen(const char *, size_t); size_t mbstowcs(wchar_t * __restrict , const char * __restrict, size_t); int mbtowc(wchar_t * __restrict, const char * __restrict, size_t); void qsort(void *, size_t, size_t, int (* _Nonnull)(const void *, const void *)); int rand(void); void *realloc(void *, size_t) __result_use_check __alloc_size(2); void srand(unsigned); double strtod(const char * __restrict, char ** __restrict); float strtof(const char * __restrict, char ** __restrict); long strtol(const char * __restrict, char ** __restrict, int); long double strtold(const char * __restrict, char ** __restrict); unsigned long strtoul(const char * __restrict, char ** __restrict, int); int system(const char *); int wctomb(char *, wchar_t); size_t wcstombs(char * __restrict, const wchar_t * __restrict, size_t); /* * Functions added in C99 which we make conditionally available in the * BSD^C89 namespace if the compiler supports `long long'. * The #if test is more complicated than it ought to be because * __BSD_VISIBLE implies __ISO_C_VISIBLE == 1999 *even if* `long long' * is not supported in the compilation environment (which therefore means * that it can't really be ISO C99). * * (The only other extension made by C99 in thie header is _Exit().) */ #if __ISO_C_VISIBLE >= 1999 || defined(__cplusplus) #ifdef __LONG_LONG_SUPPORTED /* LONGLONG */ typedef struct { long long quot; long long rem; } lldiv_t; /* LONGLONG */ long long atoll(const char *); /* LONGLONG */ long long llabs(long long) __pure2; /* LONGLONG */ lldiv_t lldiv(long long, long long) __pure2; /* LONGLONG */ long long strtoll(const char * __restrict, char ** __restrict, int); /* LONGLONG */ unsigned long long strtoull(const char * __restrict, char ** __restrict, int); #endif /* __LONG_LONG_SUPPORTED */ _Noreturn void _Exit(int); #endif /* __ISO_C_VISIBLE >= 1999 */ /* * If we're in a mode greater than C99, expose C11 functions. */ #if __ISO_C_VISIBLE >= 2011 || __cplusplus >= 201103L void * aligned_alloc(size_t, size_t) __malloc_like __alloc_align(1) __alloc_size(2); int at_quick_exit(void (*)(void)); _Noreturn void quick_exit(int); #endif /* __ISO_C_VISIBLE >= 2011 */ /* * Extensions made by POSIX relative to C. */ #if __POSIX_VISIBLE >= 199506 || __XSI_VISIBLE char *realpath(const char * __restrict, char * __restrict); #endif #if __POSIX_VISIBLE >= 199506 int rand_r(unsigned *); /* (TSF) */ #endif #if __POSIX_VISIBLE >= 200112 int posix_memalign(void **, size_t, size_t); /* (ADV) */ int setenv(const char *, const char *, int); int unsetenv(const char *); #endif #if __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE int getsubopt(char **, char *const *, char **); #ifndef _MKDTEMP_DECLARED char *mkdtemp(char *); #define _MKDTEMP_DECLARED #endif #ifndef _MKSTEMP_DECLARED int mkstemp(char *); #define _MKSTEMP_DECLARED #endif #endif /* __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE */ /* * The only changes to the XSI namespace in revision 6 were the deletion * of the ttyslot() and valloc() functions, which FreeBSD never declared * in this header. For revision 7, ecvt(), fcvt(), and gcvt(), which * FreeBSD also does not have, and mktemp(), are to be deleted. */ #if __XSI_VISIBLE /* XXX XSI requires pollution from here. We'd rather not. */ long a64l(const char *); double drand48(void); /* char *ecvt(double, int, int * __restrict, int * __restrict); */ double erand48(unsigned short[3]); /* char *fcvt(double, int, int * __restrict, int * __restrict); */ /* char *gcvt(double, int, int * __restrict, int * __restrict); */ int grantpt(int); char *initstate(unsigned int, char *, size_t); long jrand48(unsigned short[3]); char *l64a(long); void lcong48(unsigned short[7]); long lrand48(void); #if !defined(_MKTEMP_DECLARED) && (__BSD_VISIBLE || __XSI_VISIBLE <= 600) char *mktemp(char *); #define _MKTEMP_DECLARED #endif long mrand48(void); long nrand48(unsigned short[3]); int posix_openpt(int); char *ptsname(int); int putenv(char *); long random(void); unsigned short *seed48(unsigned short[3]); char *setstate(/* const */ char *); void srand48(long); void srandom(unsigned int); int unlockpt(int); #endif /* __XSI_VISIBLE */ #if __BSD_VISIBLE extern const char *malloc_conf; extern void (*malloc_message)(void *, const char *); /* * The alloca() function can't be implemented in C, and on some * platforms it can't be implemented at all as a callable function. * The GNU C compiler provides a built-in alloca() which we can use. * On platforms where alloca() is not in libc, programs which use it * will fail to link when compiled with non-GNU compilers. */ #if __GNUC__ >= 2 || defined(__INTEL_COMPILER) #undef alloca /* some GNU bits try to get cute and define this on their own */ #define alloca(sz) __builtin_alloca(sz) #endif void abort2(const char *, int, void **) __dead2; __uint32_t arc4random(void); void arc4random_buf(void *, size_t); __uint32_t arc4random_uniform(__uint32_t); #ifdef __BLOCKS__ int atexit_b(void (^ _Nonnull)(void)); void *bsearch_b(const void *, const void *, size_t, size_t, int (^ _Nonnull)(const void *, const void *)); #endif char *getbsize(int *, long *); /* getcap(3) functions */ char *cgetcap(char *, const char *, int); int cgetclose(void); int cgetent(char **, char **, const char *); int cgetfirst(char **, char **); int cgetmatch(const char *, const char *); int cgetnext(char **, char **); int cgetnum(char *, const char *, long *); int cgetset(const char *); int cgetstr(char *, const char *, char **); int cgetustr(char *, const char *, char **); int daemon(int, int); int daemonfd(int, int); char *devname(__dev_t, __mode_t); char *devname_r(__dev_t, __mode_t, char *, int); char *fdevname(int); char *fdevname_r(int, char *, int); int getloadavg(double [], int); const char * getprogname(void); int heapsort(void *, size_t, size_t, int (* _Nonnull)(const void *, const void *)); #ifdef __BLOCKS__ int heapsort_b(void *, size_t, size_t, int (^ _Nonnull)(const void *, const void *)); void qsort_b(void *, size_t, size_t, int (^ _Nonnull)(const void *, const void *)); #endif int l64a_r(long, char *, int); int mergesort(void *, size_t, size_t, int (*)(const void *, const void *)); #ifdef __BLOCKS__ int mergesort_b(void *, size_t, size_t, int (^)(const void *, const void *)); #endif int mkostemp(char *, int); int mkostemps(char *, int, int); +int mkostempsat(int, char *, int, int); void qsort_r(void *, size_t, size_t, void *, int (*)(void *, const void *, const void *)); int radixsort(const unsigned char **, int, const unsigned char *, unsigned); void *reallocarray(void *, size_t, size_t) __result_use_check __alloc_size2(2, 3); void *reallocf(void *, size_t) __result_use_check __alloc_size(2); int rpmatch(const char *); void setprogname(const char *); int sradixsort(const unsigned char **, int, const unsigned char *, unsigned); void sranddev(void); void srandomdev(void); long long strtonum(const char *, long long, long long, const char **); /* Deprecated interfaces, to be removed. */ __int64_t strtoq(const char *, char **, int); __uint64_t strtouq(const char *, char **, int); extern char *suboptarg; /* getsubopt(3) external variable */ #endif /* __BSD_VISIBLE */ #if __EXT1_VISIBLE #ifndef _ERRNO_T_DEFINED #define _ERRNO_T_DEFINED typedef int errno_t; #endif /* K.3.6 */ typedef void (*constraint_handler_t)(const char * __restrict, void * __restrict, errno_t); /* K.3.6.1.1 */ constraint_handler_t set_constraint_handler_s(constraint_handler_t handler); /* K.3.6.1.2 */ _Noreturn void abort_handler_s(const char * __restrict, void * __restrict, errno_t); /* K3.6.1.3 */ void ignore_handler_s(const char * __restrict, void * __restrict, errno_t); #endif /* __EXT1_VISIBLE */ __END_DECLS __NULLABILITY_PRAGMA_POP #endif /* !_STDLIB_H_ */ Index: projects/fuse2/lib/libarchive/tests/Makefile =================================================================== --- projects/fuse2/lib/libarchive/tests/Makefile (revision 350434) +++ projects/fuse2/lib/libarchive/tests/Makefile (revision 350435) @@ -1,629 +1,628 @@ # $FreeBSD$ PACKAGE= tests _LIBARCHIVEDIR= ${SRCTOP}/contrib/libarchive ATF_TESTS_SH+= functional_test TEST_METADATA.functional_test+= timeout="600" BINDIR= ${TESTSDIR} PROGS+= libarchive_test CFLAGS+= -I${.CURDIR} -I${.CURDIR:H} -I${.OBJDIR} CFLAGS+= -I${_LIBARCHIVEDIR}/libarchive -I${_LIBARCHIVEDIR}/libarchive/test CFLAGS+= -I${_LIBARCHIVEDIR}/test_utils CFLAGS+= -DHAVE_LIBLZMA=1 -DHAVE_LZMA_H=1 # Uncomment to link against dmalloc #LDADD+= -L/usr/local/lib -ldmalloc #CFLAGS+= -I/usr/local/include -DUSE_DMALLOC .PATH: ${_LIBARCHIVEDIR}/libarchive/test TESTS_SRCS= \ test_acl_nfs4.c \ test_acl_pax.c \ test_acl_platform_nfs4.c \ test_acl_platform_posix1e.c \ test_acl_posix1e.c \ test_acl_text.c \ test_archive_api_feature.c \ test_archive_clear_error.c \ test_archive_cmdline.c \ test_archive_digest.c \ test_archive_getdate.c \ test_archive_match_time.c \ test_archive_match_owner.c \ test_archive_match_path.c \ test_archive_pathmatch.c \ test_archive_read_add_passphrase.c \ test_archive_read_close_twice.c \ test_archive_read_close_twice_open_fd.c \ test_archive_read_close_twice_open_filename.c \ test_archive_read_multiple_data_objects.c \ test_archive_read_next_header_empty.c \ test_archive_read_next_header_raw.c \ test_archive_read_open2.c \ test_archive_read_set_filter_option.c \ test_archive_read_set_format_option.c \ test_archive_read_set_option.c \ test_archive_read_set_options.c \ test_archive_read_support.c \ test_archive_set_error.c \ test_archive_string.c \ test_archive_string_conversion.c \ test_archive_write_add_filter_by_name.c \ test_archive_write_set_filter_option.c \ test_archive_write_set_format_by_name.c \ test_archive_write_set_format_filter_by_ext.c \ test_archive_write_set_format_option.c \ test_archive_write_set_option.c \ test_archive_write_set_options.c \ test_archive_write_set_passphrase.c \ test_bad_fd.c \ test_compat_bzip2.c \ test_compat_cpio.c \ test_compat_gtar.c \ test_compat_gzip.c \ test_compat_lz4.c \ test_compat_lzip.c \ test_compat_lzma.c \ test_compat_lzop.c \ test_compat_mac.c \ test_compat_perl_archive_tar.c \ test_compat_plexus_archiver_tar.c \ test_compat_solaris_tar_acl.c \ test_compat_solaris_pax_sparse.c \ test_compat_star_acl.c \ test_compat_tar_hardlink.c \ test_compat_uudecode.c \ test_compat_uudecode_large.c \ test_compat_xz.c \ test_compat_zip.c \ test_compat_zstd.c \ test_empty_write.c \ test_entry.c \ test_entry_strmode.c \ test_extattr_freebsd.c \ test_filter_count.c \ test_fuzz.c \ test_gnutar_filename_encoding.c \ test_link_resolver.c \ test_open_fd.c \ test_open_failure.c \ test_open_file.c \ test_open_filename.c \ test_pax_filename_encoding.c \ test_read_data_large.c \ test_read_disk.c \ test_read_disk_directory_traversals.c \ test_read_disk_entry_from_file.c \ test_read_extract.c \ test_read_file_nonexistent.c \ test_read_filter_compress.c \ test_read_filter_grzip.c \ test_read_filter_lrzip.c \ test_read_filter_lzop.c \ test_read_filter_lzop_multiple_parts.c \ test_read_filter_program.c \ test_read_filter_program_signature.c \ test_read_filter_uudecode.c \ test_read_format_7zip.c \ test_read_format_7zip_encryption_data.c \ test_read_format_7zip_encryption_header.c \ test_read_format_7zip_encryption_partially.c \ test_read_format_7zip_malformed.c \ test_read_format_ar.c \ test_read_format_cab.c \ test_read_format_cab_filename.c \ test_read_format_cpio_afio.c \ test_read_format_cpio_bin.c \ test_read_format_cpio_bin_Z.c \ test_read_format_cpio_bin_be.c \ test_read_format_cpio_bin_bz2.c \ test_read_format_cpio_bin_gz.c \ test_read_format_cpio_bin_le.c \ test_read_format_cpio_bin_lzip.c \ test_read_format_cpio_bin_lzma.c \ test_read_format_cpio_bin_xz.c \ test_read_format_cpio_filename.c \ test_read_format_cpio_odc.c \ test_read_format_cpio_svr4_gzip.c \ test_read_format_cpio_svr4c_Z.c \ test_read_format_cpio_svr4_bzip2_rpm.c \ test_read_format_cpio_svr4_gzip_rpm.c \ test_read_format_empty.c \ test_read_format_gtar_filename.c \ test_read_format_gtar_gz.c \ test_read_format_gtar_lzma.c \ test_read_format_gtar_sparse.c \ test_read_format_gtar_sparse_skip_entry.c \ test_read_format_iso_Z.c \ test_read_format_iso_multi_extent.c \ test_read_format_iso_xorriso.c \ test_read_format_isorr_rr_moved.c \ test_read_format_isojoliet_bz2.c \ test_read_format_isojoliet_long.c \ test_read_format_isojoliet_rr.c \ test_read_format_isojoliet_versioned.c \ test_read_format_isorr_bz2.c \ test_read_format_isorr_ce.c \ test_read_format_isorr_new_bz2.c \ test_read_format_isozisofs_bz2.c \ test_read_format_lha.c \ test_read_format_lha_bugfix_0.c \ test_read_format_lha_filename.c \ test_read_format_mtree.c \ test_read_format_mtree_crash747.c \ test_read_format_pax_bz2.c \ test_read_format_rar.c \ test_read_format_rar5.c \ test_read_format_rar_encryption_data.c \ test_read_format_rar_encryption_header.c \ test_read_format_rar_encryption_partially.c \ test_read_format_rar_invalid1.c \ test_read_format_raw.c \ test_read_format_tar.c \ test_read_format_tar_concatenated.c \ test_read_format_tar_empty_filename.c \ test_read_format_tar_empty_pax.c \ test_read_format_tar_empty_with_gnulabel.c \ test_read_format_tar_filename.c \ test_read_format_tbz.c \ test_read_format_tgz.c \ test_read_format_tlz.c \ test_read_format_txz.c \ test_read_format_tz.c \ test_read_format_ustar_filename.c \ test_read_format_warc.c \ test_read_format_xar.c \ test_read_format_zip.c \ test_read_format_zip_7075_utf8_paths.c \ test_read_format_zip_comment_stored.c \ test_read_format_zip_encryption_data.c \ test_read_format_zip_encryption_header.c \ test_read_format_zip_encryption_partially.c \ test_read_format_zip_extra_padding.c \ test_read_format_zip_filename.c \ test_read_format_zip_high_compression.c \ test_read_format_zip_jar.c \ test_read_format_zip_mac_metadata.c \ test_read_format_zip_malformed.c \ test_read_format_zip_msdos.c \ test_read_format_zip_nested.c \ test_read_format_zip_nofiletype.c \ test_read_format_zip_padded.c \ test_read_format_zip_sfx.c \ test_read_format_zip_traditional_encryption_data.c \ test_read_format_zip_winzip_aes.c \ test_read_format_zip_winzip_aes_large.c \ test_read_format_zip_with_invalid_traditional_eocd.c \ test_read_format_zip_zip64.c \ test_read_large.c \ test_read_pax_schily_xattr.c \ test_read_pax_truncated.c \ test_read_position.c \ test_read_set_format.c \ test_read_too_many_filters.c \ test_read_truncated.c \ test_read_truncated_filter.c \ test_sparse_basic.c \ test_tar_filenames.c \ test_tar_large.c \ test_warn_missing_hardlink_target.c \ test_ustar_filenames.c \ test_ustar_filename_encoding.c \ test_write_disk.c \ test_write_disk_appledouble.c \ test_write_disk_failures.c \ test_write_disk_hardlink.c \ test_write_disk_hfs_compression.c \ test_write_disk_lookup.c \ test_write_disk_mac_metadata.c \ test_write_disk_no_hfs_compression.c \ test_write_disk_perms.c \ test_write_disk_secure.c \ test_write_disk_secure744.c \ test_write_disk_secure745.c \ test_write_disk_secure746.c \ test_write_disk_sparse.c \ test_write_disk_symlink.c \ test_write_disk_times.c \ test_write_filter_b64encode.c \ test_write_filter_bzip2.c \ test_write_filter_compress.c \ test_write_filter_gzip.c \ test_write_filter_gzip_timestamp.c \ test_write_filter_lrzip.c \ test_write_filter_lz4.c \ test_write_filter_lzip.c \ test_write_filter_lzma.c \ test_write_filter_lzop.c \ test_write_filter_program.c \ test_write_filter_uuencode.c \ test_write_filter_xz.c \ test_write_filter_zstd.c \ test_write_format_7zip.c \ test_write_format_7zip_empty.c \ test_write_format_7zip_large.c \ test_write_format_ar.c \ test_write_format_cpio.c \ test_write_format_cpio_empty.c \ test_write_format_cpio_newc.c \ test_write_format_cpio_odc.c \ test_write_format_gnutar.c \ test_write_format_gnutar_filenames.c \ test_write_format_iso9660.c \ test_write_format_iso9660_boot.c \ test_write_format_iso9660_empty.c \ test_write_format_iso9660_filename.c \ test_write_format_iso9660_zisofs.c \ test_write_format_mtree.c \ test_write_format_mtree_absolute_path.c \ test_write_format_mtree_classic.c \ test_write_format_mtree_classic_indent.c \ test_write_format_mtree_fflags.c \ test_write_format_mtree_no_separator.c \ test_write_format_mtree_quoted_filename.c \ test_write_format_pax.c \ test_write_format_raw.c \ test_write_format_raw_b64.c \ test_write_format_shar_empty.c \ test_write_format_tar.c \ test_write_format_tar_empty.c \ test_write_format_tar_sparse.c \ test_write_format_tar_ustar.c \ test_write_format_tar_v7tar.c \ test_write_format_warc.c \ test_write_format_warc_empty.c \ test_write_format_xar.c \ test_write_format_xar_empty.c \ test_write_format_zip.c \ test_write_format_zip_compression_store.c \ test_write_format_zip_empty.c \ test_write_format_zip_empty_zip64.c \ test_write_format_zip_file.c \ test_write_format_zip_file_zip64.c \ test_write_format_zip_large.c \ test_write_format_zip_zip64.c \ test_write_open_memory.c \ test_write_read_format_zip.c \ test_xattr_platform.c \ test_zip_filename_encoding.c # Deterministic failures: # Crashes with SIGBUS BROKEN_TESTS+= test_archive_rmd160 # Fails with `libarchive/test/test_archive_crypto.c:121: md != actualmd` BROKEN_TESTS+= test_archive_sha384 # Fails with `test_read_disk_directory_traversals.c:1094: File at has atime 886622, 1443306049 seconds ago` BROKEN_TESTS+= test_read_disk_directory_traversals # Non-deterministic failures: # (Times out?) [and] crashes BROKEN_TESTS+= test_fuzz_rar # Build the test program. SRCS.libarchive_test= \ ${TESTS_SRCS} \ read_open_memory.c \ list.h LIBADD.libarchive_test= archive .PATH: ${_LIBARCHIVEDIR}/test_utils SRCS.libarchive_test+= test_main.c \ test_utils.c # list.h is just a list of all tests, as indicated by DEFINE_TEST macro lines list.h: ${TESTS_SRCS} Makefile @(cd ${_LIBARCHIVEDIR}/libarchive/test && \ grep -E -h ^DEFINE_TEST ${.ALLSRC:N*Makefile} | \ egrep -v '${BROKEN_TESTS:tW:C/ /|/g}') > ${.TARGET}.tmp @mv ${.TARGET}.tmp ${.TARGET} CLEANTESTS+= list.h list.h.tmp ${PACKAGE}FILES+= README ${PACKAGE}FILES+= test_acl_pax_posix1e.tar.uu ${PACKAGE}FILES+= test_acl_pax_nfs4.tar.uu ${PACKAGE}FILES+= test_archive_string_conversion.txt.Z.uu ${PACKAGE}FILES+= test_compat_bzip2_1.tbz.uu ${PACKAGE}FILES+= test_compat_bzip2_2.tbz.uu ${PACKAGE}FILES+= test_compat_cpio_1.cpio.uu ${PACKAGE}FILES+= test_compat_gtar_1.tar.uu ${PACKAGE}FILES+= test_compat_gtar_2.tar.uu ${PACKAGE}FILES+= test_compat_gzip_1.tgz.uu ${PACKAGE}FILES+= test_compat_gzip_2.tgz.uu ${PACKAGE}FILES+= test_compat_lz4_1.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_2.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_3.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B4.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B4BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B4BDBX.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B5.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B5BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B6.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B6BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B7.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B7BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lzip_1.tlz.uu ${PACKAGE}FILES+= test_compat_lzip_2.tlz.uu ${PACKAGE}FILES+= test_compat_lzma_1.tlz.uu ${PACKAGE}FILES+= test_compat_lzma_2.tlz.uu ${PACKAGE}FILES+= test_compat_lzma_3.tlz.uu ${PACKAGE}FILES+= test_compat_lzop_1.tar.lzo.uu ${PACKAGE}FILES+= test_compat_lzop_2.tar.lzo.uu ${PACKAGE}FILES+= test_compat_lzop_3.tar.lzo.uu ${PACKAGE}FILES+= test_compat_mac-1.tar.Z.uu ${PACKAGE}FILES+= test_compat_mac-2.tar.Z.uu ${PACKAGE}FILES+= test_compat_perl_archive_tar.tar.uu ${PACKAGE}FILES+= test_compat_plexus_archiver_tar.tar.uu ${PACKAGE}FILES+= test_compat_solaris_pax_sparse_1.pax.Z.uu ${PACKAGE}FILES+= test_compat_solaris_pax_sparse_2.pax.Z.uu ${PACKAGE}FILES+= test_compat_solaris_tar_acl.tar.uu ${PACKAGE}FILES+= test_compat_star_acl_nfs4.tar.uu ${PACKAGE}FILES+= test_compat_star_acl_posix1e.tar.uu ${PACKAGE}FILES+= test_compat_tar_hardlink_1.tar.uu ${PACKAGE}FILES+= test_compat_uudecode_large.tar.Z.uu ${PACKAGE}FILES+= test_compat_xz_1.txz.uu ${PACKAGE}FILES+= test_compat_zip_1.zip.uu ${PACKAGE}FILES+= test_compat_zip_2.zip.uu ${PACKAGE}FILES+= test_compat_zip_3.zip.uu ${PACKAGE}FILES+= test_compat_zip_4.zip.uu ${PACKAGE}FILES+= test_compat_zip_5.zip.uu ${PACKAGE}FILES+= test_compat_zip_6.zip.uu ${PACKAGE}FILES+= test_compat_zip_7.xps.uu ${PACKAGE}FILES+= test_compat_zip_8.zip.uu ${PACKAGE}FILES+= test_compat_zstd_1.tar.zst.uu ${PACKAGE}FILES+= test_fuzz.cab.uu ${PACKAGE}FILES+= test_fuzz.lzh.uu ${PACKAGE}FILES+= test_fuzz_1.iso.Z.uu ${PACKAGE}FILES+= test_pax_filename_encoding.tar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part1.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part2.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part3.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part4.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part5.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part6.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_single_file.part1.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_single_file.part2.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_single_file.part3.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part01.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part02.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part03.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part04.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part05.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part06.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part07.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part08.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part09.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part10.rar.uu ${PACKAGE}FILES+= test_read_filter_grzip.tar.grz.uu ${PACKAGE}FILES+= test_read_filter_lrzip.tar.lrz.uu ${PACKAGE}FILES+= test_read_filter_lzop.tar.lzo.uu ${PACKAGE}FILES+= test_read_filter_lzop_multiple_parts.tar.lzo.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_bzip2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_copy_1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_copy_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_copy_lzma.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_deflate.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma1_1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma1_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma2_1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma2_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_bzip2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_copy.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_deflate.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_lzma1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bzip2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_copy.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_copy_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_deflate.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_delta_lzma1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_delta_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_empty_archive.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_empty_file.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_encryption.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_encryption_header.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_encryption_partially.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma1_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma1_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_malformed.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_malformed2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_ppmd.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_symbolic_name.7z.uu ${PACKAGE}FILES+= test_read_format_ar.ar.uu ${PACKAGE}FILES+= test_read_format_cab_1.cab.uu ${PACKAGE}FILES+= test_read_format_cab_2.cab.uu ${PACKAGE}FILES+= test_read_format_cab_3.cab.uu ${PACKAGE}FILES+= test_read_format_cab_filename_cp932.cab.uu ${PACKAGE}FILES+= test_read_format_cpio_bin_be.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_bin_le.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_cp866.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_eucjp.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_koi8r.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_utf8_jp.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_utf8_ru.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_svr4_bzip2_rpm.rpm.uu ${PACKAGE}FILES+= test_read_format_cpio_svr4_gzip_rpm.rpm.uu ${PACKAGE}FILES+= test_read_format_gtar_filename_cp866.tar.Z.uu ${PACKAGE}FILES+= test_read_format_gtar_filename_eucjp.tar.Z.uu ${PACKAGE}FILES+= test_read_format_gtar_filename_koi8r.tar.Z.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_13.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix00.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix01.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix10.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix10_modified.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_skip_entry.tar.Z.uu ${PACKAGE}FILES+= test_read_format_iso.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_2.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet_by_nero.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet_long.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet_rockridge.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_multi_extent.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge_ce.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge_new.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge_rr_moved.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_xorriso.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_zisofs.iso.Z.uu ${PACKAGE}FILES+= test_read_format_lha_bugfix_0.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_filename_cp932.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header0.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header1.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header2.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header3.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_lh0.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_lh6.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_lh7.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_withjunk.lzh.uu ${PACKAGE}FILES+= test_read_format_mtree.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_crash747.mtree.bz2.uu ${PACKAGE}FILES+= test_read_format_mtree_nomagic.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_nomagic2.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_nomagic3.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_noprint.mtree.uu ${PACKAGE}FILES+= test_read_format_rar.rar.uu ${PACKAGE}FILES+= test_read_format_rar_binary_data.rar.uu ${PACKAGE}FILES+= test_read_format_rar_compress_best.rar.uu ${PACKAGE}FILES+= test_read_format_rar_compress_normal.rar.uu ${PACKAGE}FILES+= test_read_format_rar_encryption_data.rar.uu ${PACKAGE}FILES+= test_read_format_rar_encryption_header.rar.uu ${PACKAGE}FILES+= test_read_format_rar_encryption_partially.rar.uu ${PACKAGE}FILES+= test_read_format_rar_invalid1.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multi_lzss_blocks.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0001.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0002.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0003.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0004.rar.uu ${PACKAGE}FILES+= test_read_format_rar_noeof.rar.uu ${PACKAGE}FILES+= test_read_format_rar_ppmd_lzss_conversion.rar.uu ${PACKAGE}FILES+= test_read_format_rar_ppmd_use_after_free.rar.uu ${PACKAGE}FILES+= test_read_format_rar_ppmd_use_after_free2.rar.uu ${PACKAGE}FILES+= test_read_format_rar_sfx.exe.uu ${PACKAGE}FILES+= test_read_format_rar_subblock.rar.uu ${PACKAGE}FILES+= test_read_format_rar_unicode.rar.uu ${PACKAGE}FILES+= test_read_format_rar_windows.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_arm.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_arm_filter_on_window_boundary.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_blake2.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_compressed.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_different_window_size.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_distance_overflow.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_extra_field_version.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_fileattr.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_hardlink.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_invalid_dict_reference.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_leftshift1.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_leftshift2.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part01.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part02.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part03.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part04.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part05.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part06.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part07.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part08.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part01.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part02.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part03.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part04.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiple_files.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiple_files_solid.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_nonempty_dir_stream.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_owner.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_readtables_overflow.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_solid.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_stored.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_stored_manyfiles.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_symlink.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_truncated_huff.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_win32.rar.uu ${PACKAGE}FILES+= test_read_format_raw.bufr.uu ${PACKAGE}FILES+= test_read_format_raw.data.Z.uu ${PACKAGE}FILES+= test_read_format_raw.data.gz.uu ${PACKAGE}FILES+= test_read_format_raw.data.uu ${PACKAGE}FILES+= test_read_format_tar_concatenated.tar.uu ${PACKAGE}FILES+= test_read_format_tar_empty_filename.tar.uu ${PACKAGE}FILES+= test_read_format_tar_empty_with_gnulabel.tar.uu ${PACKAGE}FILES+= test_read_format_tar_empty_pax.tar.Z.uu ${PACKAGE}FILES+= test_read_format_tar_filename_koi8r.tar.Z.uu ${PACKAGE}FILES+= test_read_format_ustar_filename_cp866.tar.Z.uu ${PACKAGE}FILES+= test_read_format_ustar_filename_eucjp.tar.Z.uu ${PACKAGE}FILES+= test_read_format_ustar_filename_koi8r.tar.Z.uu ${PACKAGE}FILES+= test_read_format_warc.warc.uu ${PACKAGE}FILES+= test_read_format_zip.zip.uu ${PACKAGE}FILES+= test_read_format_zip_7075_utf8_paths.zip.uu ${PACKAGE}FILES+= test_read_format_zip_bz2_hang.zip.uu ${PACKAGE}FILES+= test_read_format_zip_bzip2.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_bzip2_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_comment_stored_1.zip.uu ${PACKAGE}FILES+= test_read_format_zip_comment_stored_2.zip.uu ${PACKAGE}FILES+= test_read_format_zip_encryption_data.zip.uu ${PACKAGE}FILES+= test_read_format_zip_encryption_header.zip.uu ${PACKAGE}FILES+= test_read_format_zip_encryption_partially.zip.uu ${PACKAGE}FILES+= test_read_format_zip_extra_padding.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_cp866.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_cp932.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_koi8r.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_utf8_jp.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_utf8_ru.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_utf8_ru2.zip.uu ${PACKAGE}FILES+= test_read_format_zip_high_compression.zip.uu ${PACKAGE}FILES+= test_read_format_zip_jar.jar.uu ${PACKAGE}FILES+= test_read_format_zip_length_at_end.zip.uu ${PACKAGE}FILES+= test_read_format_zip_lzma_alone_leak.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_lzma.zipx.uu -${PACKAGE}FILES+= test_read_format_zip_lzma.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_lzma_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_mac_metadata.zip.uu ${PACKAGE}FILES+= test_read_format_zip_malformed1.zip.uu ${PACKAGE}FILES+= test_read_format_zip_msdos.zip.uu ${PACKAGE}FILES+= test_read_format_zip_nested.zip.uu ${PACKAGE}FILES+= test_read_format_zip_nofiletype.zip.uu ${PACKAGE}FILES+= test_read_format_zip_padded1.zip.uu ${PACKAGE}FILES+= test_read_format_zip_padded2.zip.uu ${PACKAGE}FILES+= test_read_format_zip_padded3.zip.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8_crash_1.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8_crash_2.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_sfx.uu ${PACKAGE}FILES+= test_read_format_zip_symlink.zip.uu ${PACKAGE}FILES+= test_read_format_zip_traditional_encryption_data.zip.uu ${PACKAGE}FILES+= test_read_format_zip_ux.zip.uu ${PACKAGE}FILES+= test_read_format_zip_with_invalid_traditional_eocd.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes128.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes256.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes256_large.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes256_stored.zip.uu ${PACKAGE}FILES+= test_read_format_zip_xz_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_zip64a.zip.uu ${PACKAGE}FILES+= test_read_format_zip_zip64b.zip.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_aa.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ab.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ac.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ad.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ae.uu ${PACKAGE}FILES+= test_read_pax_schily_xattr.tar.uu ${PACKAGE}FILES+= test_read_splitted_rar_aa.uu ${PACKAGE}FILES+= test_read_splitted_rar_ab.uu ${PACKAGE}FILES+= test_read_splitted_rar_ac.uu ${PACKAGE}FILES+= test_read_splitted_rar_ad.uu ${PACKAGE}FILES+= test_read_too_many_filters.gz.uu ${PACKAGE}FILES+= test_splitted_rar_seek_support_aa.uu ${PACKAGE}FILES+= test_splitted_rar_seek_support_ab.uu ${PACKAGE}FILES+= test_splitted_rar_seek_support_ac.uu ${PACKAGE}FILES+= test_write_disk_appledouble.cpio.gz.uu ${PACKAGE}FILES+= test_write_disk_hfs_compression.tgz.uu ${PACKAGE}FILES+= test_write_disk_mac_metadata.tar.gz.uu ${PACKAGE}FILES+= test_write_disk_no_hfs_compression.tgz.uu .include Index: projects/fuse2/lib/libc/stdio/Makefile.inc =================================================================== --- projects/fuse2/lib/libc/stdio/Makefile.inc (revision 350434) +++ projects/fuse2/lib/libc/stdio/Makefile.inc (revision 350435) @@ -1,88 +1,88 @@ # @(#)Makefile.inc 8.3 (Berkeley) 4/17/94 # $FreeBSD$ # stdio sources .PATH: ${LIBC_SRCTOP}/stdio SRCS+= _flock_stub.c asprintf.c clrerr.c dprintf.c \ fclose.c fcloseall.c fdopen.c \ feof.c ferror.c fflush.c fgetc.c fgetln.c fgetpos.c fgets.c fgetwc.c \ fgetwln.c fgetws.c \ fileno.c findfp.c flags.c fmemopen.c fopen.c \ fopencookie.c fprintf.c fpurge.c \ fputc.c fputs.c \ fputwc.c fputws.c fread.c freopen.c fscanf.c fseek.c fsetpos.c \ ftell.c funopen.c fvwrite.c fwalk.c fwide.c fwprintf.c fwscanf.c \ fwrite.c getc.c getchar.c getdelim.c getline.c \ gets.c gets_s.c getw.c getwc.c getwchar.c makebuf.c mktemp.c \ open_memstream.c open_wmemstream.c \ perror.c printf.c printf-pos.c putc.c putchar.c \ puts.c putw.c putwc.c putwchar.c \ refill.c remove.c rewind.c rget.c scanf.c setbuf.c setbuffer.c \ setvbuf.c snprintf.c sprintf.c sscanf.c stdio.c swprintf.c swscanf.c \ tempnam.c tmpfile.c \ tmpnam.c ungetc.c ungetwc.c vasprintf.c vdprintf.c vfprintf.c \ vfscanf.c \ vfwprintf.c vfwscanf.c vprintf.c vscanf.c vsnprintf.c vsprintf.c \ vsscanf.c \ vswprintf.c vswscanf.c vwprintf.c vwscanf.c wbuf.c wprintf.c wscanf.c \ wsetup.c SRCS+= xprintf.c xprintf_float.c xprintf_int.c xprintf_str.c SRCS+= xprintf_errno.c xprintf_hexdump.c xprintf_quote.c SRCS+= xprintf_time.c xprintf_vis.c SYM_MAPS+= ${LIBC_SRCTOP}/stdio/Symbol.map MAN+= fclose.3 ferror.3 fflush.3 fgetln.3 fgets.3 fgetwln.3 fgetws.3 \ flockfile.3 \ fopen.3 fopencookie.3 fputs.3 \ fputws.3 fread.3 fseek.3 funopen.3 fwide.3 getc.3 \ getline.3 getwc.3 mktemp.3 open_memstream.3 \ printf.3 printf_l.3 putc.3 putwc.3 remove.3 scanf.3 scanf_l.3 setbuf.3 \ stdio.3 tmpnam.3 \ ungetc.3 ungetwc.3 wprintf.3 wscanf.3 MLINKS+=fclose.3 fcloseall.3 fclose.3 fdclose.3 MLINKS+=ferror.3 ferror_unlocked.3 \ ferror.3 clearerr.3 ferror.3 clearerr_unlocked.3 \ ferror.3 feof.3 ferror.3 feof_unlocked.3 \ ferror.3 fileno.3 ferror.3 fileno_unlocked.3 MLINKS+=fflush.3 fpurge.3 MLINKS+=fgets.3 gets.3 MLINKS+=fgets.3 gets_s.3 MLINKS+=flockfile.3 ftrylockfile.3 flockfile.3 funlockfile.3 MLINKS+=fopen.3 fdopen.3 fopen.3 freopen.3 fopen.3 fmemopen.3 MLINKS+=fputs.3 puts.3 MLINKS+=fread.3 fwrite.3 MLINKS+=fseek.3 fgetpos.3 fseek.3 fseeko.3 fseek.3 fsetpos.3 fseek.3 ftell.3 \ fseek.3 ftello.3 fseek.3 rewind.3 MLINKS+=funopen.3 fropen.3 funopen.3 fwopen.3 MLINKS+=getc.3 fgetc.3 getc.3 getc_unlocked.3 getc.3 getchar.3 \ getc.3 getchar_unlocked.3 getc.3 getw.3 MLINKS+=getline.3 getdelim.3 MLINKS+=getwc.3 fgetwc.3 getwc.3 getwchar.3 MLINKS+=mktemp.3 mkdtemp.3 mktemp.3 mkstemp.3 mktemp.3 mkstemps.3 \ - mktemp.3 mkostemp.3 mktemp.3 mkostemps.3 + mktemp.3 mkostemp.3 mktemp.3 mkostemps.3 mktemp.3 mkostempsat.3 MLINKS+=open_memstream.3 open_wmemstream.3 MLINKS+=printf.3 asprintf.3 printf.3 dprintf.3 printf.3 fprintf.3 \ printf.3 snprintf.3 printf.3 sprintf.3 \ printf.3 vasprintf.3 printf.3 vdprintf.3 \ printf.3 vfprintf.3 printf.3 vprintf.3 printf.3 vsnprintf.3 \ printf.3 vsprintf.3 MLINKS+=printf_l.3 asprintf_l.3 printf_l.3 fprintf_l.3 printf_l.3 snprintf_l.3 \ printf_l.3 sprintf_l.3 printf_l.3 vasprintf_l.3 printf_l.3 vfprintf_l.3 \ printf_l.3 vprintf_l.3 printf_l.3 vsnprintf_l.3 printf_l.3 vsprintf_l.3 MLINKS+=putc.3 fputc.3 putc.3 putc_unlocked.3 putc.3 putchar.3 \ putc.3 putchar_unlocked.3 putc.3 putw.3 MLINKS+=putwc.3 fputwc.3 putwc.3 putwchar.3 MLINKS+=scanf.3 fscanf.3 scanf.3 sscanf.3 scanf.3 vfscanf.3 scanf.3 vscanf.3 \ scanf.3 vsscanf.3 MLINKS+=scanf_l.3 fscanf_l.3 scanf_l.3 sscanf_l.3 scanf_l.3 vfscanf_l.3 \ scanf_l.3 vscanf_l.3 scanf_l.3 vsscanf_l.3 MLINKS+=setbuf.3 setbuffer.3 setbuf.3 setlinebuf.3 setbuf.3 setvbuf.3 MLINKS+=tmpnam.3 tempnam.3 tmpnam.3 tmpfile.3 MLINKS+=wprintf.3 fwprintf.3 wprintf.3 swprintf.3 \ wprintf.3 vwprintf.3 wprintf.3 vfwprintf.3 wprintf.3 vswprintf.3 MLINKS+=wscanf.3 fwscanf.3 wscanf.3 swscanf.3 wscanf.3 vwscanf.3 \ wscanf.3 vswscanf.3 wscanf.3 vfwscanf.3 Index: projects/fuse2/lib/libc/stdio/Symbol.map =================================================================== --- projects/fuse2/lib/libc/stdio/Symbol.map (revision 350434) +++ projects/fuse2/lib/libc/stdio/Symbol.map (revision 350435) @@ -1,212 +1,216 @@ /* * $FreeBSD$ */ FBSD_1.0 { flockfile; ftrylockfile; funlockfile; asprintf; clearerr; fclose; fcloseall; fdopen; feof; ferror; fflush; fgetc; fgetln; fgetpos; fgets; fgetwc; fgetwln; fgetws; fileno; __sF; __stdinp; __stdoutp; __stderrp; f_prealloc; /* deprecated??? */ fopen; fprintf; fpurge; fputc; fputs; fputwc; fputws; fread; freopen; fscanf; fseek; fseeko; fsetpos; ftell; ftello; funopen; fwide; fwprintf; fwrite; fwscanf; getc; getchar; gets; getw; getwc; getwchar; mkstemps; mkstemp; mkdtemp; mktemp; perror; printf; putc; putchar; puts; putw; putwc; putwchar; remove; rewind; __srget; scanf; setbuf; setbuffer; setlinebuf; setvbuf; snprintf; sprintf; sscanf; swprintf; swscanf; tempnam; tmpfile; tmpnam; ungetc; ungetwc; getchar_unlocked; getc_unlocked; putchar_unlocked; putc_unlocked; feof_unlocked; ferror_unlocked; clearerr_unlocked; fileno_unlocked; vasprintf; vfprintf; vfscanf; vfwprintf; vfwscanf; vprintf; vscanf; vsnprintf; vsprintf; vsscanf; vswprintf; vswscanf; vwprintf; vwscanf; __swbuf; wprintf; wscanf; }; FBSD_1.1 { dprintf; getdelim; getline; vdprintf; }; FBSD_1.3 { asprintf_l; fprintf_l; fwprintf_l; printf_l; snprintf_l; sprintf_l; swprintf_l; vasprintf_l; vfprintf_l; vfwprintf_l; vprintf_l; vsnprintf_l; vsprintf_l; vswprintf_l; vwprintf_l; wprintf_l; fgetwc_l; fputwc_l; ungetwc_l; vfwscanf_l; vswscanf_l; fscanf_l; fwscanf_l; scanf_l; sscanf_l; swscanf_l; vfscanf_l; vscanf_l; vsscanf_l; vwscanf_l; wscanf_l; fgetws_l; fputws_l; getwc_l; getwchar_l; putwc_l; putwchar_l; fmemopen; open_memstream; open_wmemstream; mkostemp; mkostemps; }; FBSD_1.4 { fdclose; fopencookie; }; FBSD_1.5 { gets_s; }; +FBSD_1.6 { + mkostempsat; +}; + FBSDprivate_1.0 { _flockfile; _flockfile_debug_stub; _flockfile_debug; _ftrylockfile; _funlockfile; __vfscanf; /* * xprintf support */ __use_xprintf; __lowercase_hex; __uppercase_hex; __printf_flush; __printf_puts; __printf_pad; __printf_out; __xvprintf; register_printf_function; register_printf_render; register_printf_render_std; __printf_arginfo_float; __printf_render_float; __printf_arginfo_hexdump; __printf_render_hexdump; __printf_arginfo_int; __printf_render_int; __printf_arginfo_ptr; __printf_render_ptr; __printf_arginfo_str; __printf_render_str; __printf_arginfo_chr; __printf_render_chr; __printf_arginfo_time; __printf_render_time; __printf_arginfo_vis; __printf_render_vis; }; Index: projects/fuse2/lib/libc/stdio/mktemp.3 =================================================================== --- projects/fuse2/lib/libc/stdio/mktemp.3 (revision 350434) +++ projects/fuse2/lib/libc/stdio/mktemp.3 (revision 350435) @@ -1,326 +1,349 @@ .\" Copyright (c) 1989, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)mktemp.3 8.1 (Berkeley) 6/4/93 .\" $FreeBSD$ .\" -.Dd August 8, 2013 +.Dd July 29, 2019 .Dt MKTEMP 3 .Os .Sh NAME .Nm mktemp .Nd make temporary file name (unique) .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In stdlib.h .Ft char * .Fn mktemp "char *template" .Ft int .Fn mkstemp "char *template" .Ft int .Fn mkostemp "char *template" "int oflags" .Ft int .Fn mkostemps "char *template" "int suffixlen" "int oflags" +.Ft int +.Fn mkostempsat "int dfd" "char *template" "int suffixlen" "int oflags" .Ft char * .Fn mkdtemp "char *template" .In unistd.h .Ft int .Fn mkstemps "char *template" "int suffixlen" .Sh DESCRIPTION The .Fn mktemp function takes the given file name template and overwrites a portion of it to create a file name. This file name is guaranteed not to exist at the time of function invocation and is suitable for use by the application. The template may be any file name with some number of .Ql X Ns s appended to it, for example .Pa /tmp/temp.XXXXXX . The trailing .Ql X Ns s are replaced with a unique alphanumeric combination. The number of unique file names .Fn mktemp can return depends on the number of .Ql X Ns s provided; six .Ql X Ns s will result in .Fn mktemp selecting one of 56800235584 (62 ** 6) possible temporary file names. .Pp The .Fn mkstemp function makes the same replacement to the template and creates the template file, mode 0600, returning a file descriptor opened for reading and writing. This avoids the race between testing for a file's existence and opening it for use. .Pp The .Fn mkostemp function is like .Fn mkstemp but allows specifying additional .Xr open 2 flags (defined in .In fcntl.h ) . The permitted flags are .Dv O_APPEND , .Dv O_DIRECT , .Dv O_SHLOCK , .Dv O_EXLOCK , .Dv O_SYNC and .Dv O_CLOEXEC . .Pp The .Fn mkstemps and .Fn mkostemps functions act the same as .Fn mkstemp and .Fn mkostemp respectively, except they permit a suffix to exist in the template. The template should be of the form .Pa /tmp/tmpXXXXXXsuffix . The .Fn mkstemps and .Fn mkostemps function are told the length of the suffix string. .Pp The +.Fn mkostempsat +function acts the same as +.Fn mkostemps +but takes an additional directory descriptor as a parameter. +The temporary file is created relative to the corresponding +directory, or to the current working directory if the special +value +.Dv AT_FDCWD +is specified. +If the template path is an absolute path, the +.Fa dfd +parameter is ignored and the behavior is identical to +.Fn mkostemps . +.Pp +The .Fn mkdtemp function makes the same replacement to the template as in .Fn mktemp and creates the template directory, mode 0700. .Sh RETURN VALUES The .Fn mktemp and .Fn mkdtemp functions return a pointer to the template on success and .Dv NULL on failure. The .Fn mkstemp , .Fn mkostemp .Fn mkstemps and .Fn mkostemps functions return \-1 if no suitable file could be created. If either call fails an error code is placed in the global variable .Va errno . .Sh ERRORS The .Fn mkstemp , .Fn mkostemp , .Fn mkstemps , .Fn mkostemps and .Fn mkdtemp functions may set .Va errno to one of the following values: .Bl -tag -width Er .It Bq Er ENOTDIR The pathname portion of the template is not an existing directory. .El .Pp The .Fn mkostemp and .Fn mkostemps functions may also set .Va errno to the following value: .Bl -tag -width Er .It Bq Er EINVAL The .Fa oflags argument is invalid. .El .Pp The .Fn mkstemp , .Fn mkostemp , .Fn mkstemps , .Fn mkostemps and .Fn mkdtemp functions may also set .Va errno to any value specified by the .Xr stat 2 function. .Pp The .Fn mkstemp , .Fn mkostemp , .Fn mkstemps and .Fn mkostemps functions may also set .Va errno to any value specified by the .Xr open 2 function. .Pp The .Fn mkdtemp function may also set .Va errno to any value specified by the .Xr mkdir 2 function. .Sh NOTES A common problem that results in a core dump is that the programmer passes in a read-only string to .Fn mktemp , .Fn mkstemp , .Fn mkstemps or .Fn mkdtemp . This is common with programs that were developed before .St -isoC compilers were common. For example, calling .Fn mkstemp with an argument of .Qq /tmp/tempfile.XXXXXX will result in a core dump due to .Fn mkstemp attempting to modify the string constant that was given. .Pp The .Fn mkdtemp , .Fn mkstemp and .Fn mktemp function prototypes are also available from .In unistd.h . .Sh SEE ALSO .Xr chmod 2 , .Xr getpid 2 , .Xr mkdir 2 , .Xr open 2 , .Xr stat 2 .Sh STANDARDS The .Fn mkstemp and .Fn mkdtemp functions are expected to conform to .St -p1003.1-2008 . The .Fn mktemp function is expected to conform to .St -p1003.1-2001 and is not specified by .St -p1003.1-2008 . The .Fn mkostemp , -.Fn mkstemps -and +.Fn mkstemps , .Fn mkostemps +and +.Fn mkostempsat functions do not conform to any standard. .Sh HISTORY A .Fn mktemp function appeared in .At v7 . The .Fn mkstemp function appeared in .Bx 4.4 . The .Fn mkdtemp function first appeared in .Ox 2.2 , and later in .Fx 3.2 . The .Fn mkstemps function first appeared in .Ox 2.4 , and later in .Fx 3.4 . The .Fn mkostemp and .Fn mkostemps functions appeared in .Fx 10.0 . +The +.Fn mkostempsat +function appeared in +.Fx 13.0 . .Sh BUGS This family of functions produces filenames which can be guessed, though the risk is minimized when large numbers of .Ql X Ns s are used to increase the number of possible temporary filenames. This makes the race in .Fn mktemp , between testing for a file's existence (in the .Fn mktemp function call) and opening it for use (later in the user application) particularly dangerous from a security perspective. Whenever it is possible, -.Fn mkstemp -or +.Fn mkstemp , .Fn mkostemp -should be used instead, since it does not have the race condition. +or +.Fn mkostempsat +should be used instead, since they do not have the race condition. If .Fn mkstemp cannot be used, the filename created by .Fn mktemp should be created using the .Dv O_EXCL flag to .Xr open 2 and the return status of the call should be tested for failure. This will ensure that the program does not continue blindly in the event that an attacker has already created the file with the intention of manipulating or reading its contents. Index: projects/fuse2/lib/libc/stdio/mktemp.c =================================================================== --- projects/fuse2/lib/libc/stdio/mktemp.c (revision 350434) +++ projects/fuse2/lib/libc/stdio/mktemp.c (revision 350435) @@ -1,212 +1,220 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1987, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)mktemp.c 8.1 (Berkeley) 6/4/93"; #endif /* LIBC_SCCS and not lint */ #include __FBSDID("$FreeBSD$"); #include "namespace.h" #include #include #include #include #include #include #include #include #include #include "un-namespace.h" char *_mktemp(char *); -static int _gettemp(char *, int *, int, int, int); +static int _gettemp(int, char *, int *, int, int, int); static const unsigned char padchar[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; int +mkostempsat(int dfd, char *path, int slen, int oflags) +{ + int fd; + + return (_gettemp(dfd, path, &fd, 0, slen, oflags) ? fd : -1); +} + +int mkostemps(char *path, int slen, int oflags) { int fd; - return (_gettemp(path, &fd, 0, slen, oflags) ? fd : -1); + return (_gettemp(AT_FDCWD, path, &fd, 0, slen, oflags) ? fd : -1); } int mkstemps(char *path, int slen) { int fd; - return (_gettemp(path, &fd, 0, slen, 0) ? fd : -1); + return (_gettemp(AT_FDCWD, path, &fd, 0, slen, 0) ? fd : -1); } int mkostemp(char *path, int oflags) { int fd; - return (_gettemp(path, &fd, 0, 0, oflags) ? fd : -1); + return (_gettemp(AT_FDCWD, path, &fd, 0, 0, oflags) ? fd : -1); } int mkstemp(char *path) { int fd; - return (_gettemp(path, &fd, 0, 0, 0) ? fd : -1); + return (_gettemp(AT_FDCWD, path, &fd, 0, 0, 0) ? fd : -1); } char * mkdtemp(char *path) { - return (_gettemp(path, (int *)NULL, 1, 0, 0) ? path : (char *)NULL); + return (_gettemp(AT_FDCWD, path, (int *)NULL, 1, 0, 0) ? path : (char *)NULL); } char * _mktemp(char *path) { - return (_gettemp(path, (int *)NULL, 0, 0, 0) ? path : (char *)NULL); + return (_gettemp(AT_FDCWD, path, (int *)NULL, 0, 0, 0) ? path : (char *)NULL); } __warn_references(mktemp, "warning: mktemp() possibly used unsafely; consider using mkstemp()"); char * mktemp(char *path) { return (_mktemp(path)); } static int -_gettemp(char *path, int *doopen, int domkdir, int slen, int oflags) +_gettemp(int dfd, char *path, int *doopen, int domkdir, int slen, int oflags) { char *start, *trv, *suffp, *carryp; char *pad; struct stat sbuf; int rval; uint32_t rand; char carrybuf[MAXPATHLEN]; if ((doopen != NULL && domkdir) || slen < 0 || (oflags & ~(O_APPEND | O_DIRECT | O_SHLOCK | O_EXLOCK | O_SYNC | O_CLOEXEC)) != 0) { errno = EINVAL; return (0); } for (trv = path; *trv != '\0'; ++trv) ; if (trv - path >= MAXPATHLEN) { errno = ENAMETOOLONG; return (0); } trv -= slen; suffp = trv; --trv; if (trv < path || NULL != strchr(suffp, '/')) { errno = EINVAL; return (0); } /* Fill space with random characters */ while (trv >= path && *trv == 'X') { rand = arc4random_uniform(sizeof(padchar) - 1); *trv-- = padchar[rand]; } start = trv + 1; /* save first combination of random characters */ memcpy(carrybuf, start, suffp - start); /* * check the target directory. */ if (doopen != NULL || domkdir) { for (; trv > path; --trv) { if (*trv == '/') { *trv = '\0'; - rval = stat(path, &sbuf); + rval = fstatat(dfd, path, &sbuf, 0); *trv = '/'; if (rval != 0) return (0); if (!S_ISDIR(sbuf.st_mode)) { errno = ENOTDIR; return (0); } break; } } } + oflags |= O_CREAT | O_EXCL | O_RDWR; for (;;) { if (doopen) { - if ((*doopen = - _open(path, O_CREAT|O_EXCL|O_RDWR|oflags, 0600)) >= - 0) + *doopen = _openat(dfd, path, oflags, 0600); + if (*doopen >= 0) return (1); if (errno != EEXIST) return (0); } else if (domkdir) { if (mkdir(path, 0700) == 0) return (1); if (errno != EEXIST) return (0); } else if (lstat(path, &sbuf)) return (errno == ENOENT); /* If we have a collision, cycle through the space of filenames */ for (trv = start, carryp = carrybuf;;) { /* have we tried all possible permutations? */ if (trv == suffp) return (0); /* yes - exit with EEXIST */ pad = strchr(padchar, *trv); if (pad == NULL) { /* this should never happen */ errno = EIO; return (0); } /* increment character */ *trv = (*++pad == '\0') ? padchar[0] : *pad; /* carry to next position? */ if (*trv == *carryp) { /* increment position and loop */ ++trv; ++carryp; } else { /* try with new name */ break; } } } /*NOTREACHED*/ } Index: projects/fuse2/sbin/camcontrol/camcontrol.c =================================================================== --- projects/fuse2/sbin/camcontrol/camcontrol.c (revision 350434) +++ projects/fuse2/sbin/camcontrol/camcontrol.c (revision 350435) @@ -1,10726 +1,10742 @@ /* * Copyright (c) 1997-2007 Kenneth D. Merry * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "camcontrol.h" #ifdef WITH_NVME #include "nvmecontrol_ext.h" #endif typedef enum { CAM_CMD_NONE = 0x00000000, CAM_CMD_DEVLIST = 0x00000001, CAM_CMD_TUR = 0x00000002, CAM_CMD_INQUIRY = 0x00000003, CAM_CMD_STARTSTOP = 0x00000004, CAM_CMD_RESCAN = 0x00000005, CAM_CMD_READ_DEFECTS = 0x00000006, CAM_CMD_MODE_PAGE = 0x00000007, CAM_CMD_SCSI_CMD = 0x00000008, CAM_CMD_DEVTREE = 0x00000009, CAM_CMD_USAGE = 0x0000000a, CAM_CMD_DEBUG = 0x0000000b, CAM_CMD_RESET = 0x0000000c, CAM_CMD_FORMAT = 0x0000000d, CAM_CMD_TAG = 0x0000000e, CAM_CMD_RATE = 0x0000000f, CAM_CMD_DETACH = 0x00000010, CAM_CMD_REPORTLUNS = 0x00000011, CAM_CMD_READCAP = 0x00000012, CAM_CMD_IDENTIFY = 0x00000013, CAM_CMD_IDLE = 0x00000014, CAM_CMD_STANDBY = 0x00000015, CAM_CMD_SLEEP = 0x00000016, CAM_CMD_SMP_CMD = 0x00000017, CAM_CMD_SMP_RG = 0x00000018, CAM_CMD_SMP_PC = 0x00000019, CAM_CMD_SMP_PHYLIST = 0x0000001a, CAM_CMD_SMP_MANINFO = 0x0000001b, CAM_CMD_DOWNLOAD_FW = 0x0000001c, CAM_CMD_SECURITY = 0x0000001d, CAM_CMD_HPA = 0x0000001e, CAM_CMD_SANITIZE = 0x0000001f, CAM_CMD_PERSIST = 0x00000020, CAM_CMD_APM = 0x00000021, CAM_CMD_AAM = 0x00000022, CAM_CMD_ATTRIB = 0x00000023, CAM_CMD_OPCODES = 0x00000024, CAM_CMD_REPROBE = 0x00000025, CAM_CMD_ZONE = 0x00000026, CAM_CMD_EPC = 0x00000027, CAM_CMD_TIMESTAMP = 0x00000028, CAM_CMD_MMCSD_CMD = 0x00000029, CAM_CMD_POWER_MODE = 0x0000002a, CAM_CMD_DEVTYPE = 0x0000002b, CAM_CMD_AMA = 0x0000002c, } cam_cmdmask; typedef enum { CAM_ARG_NONE = 0x00000000, CAM_ARG_VERBOSE = 0x00000001, CAM_ARG_DEVICE = 0x00000002, CAM_ARG_BUS = 0x00000004, CAM_ARG_TARGET = 0x00000008, CAM_ARG_LUN = 0x00000010, CAM_ARG_EJECT = 0x00000020, CAM_ARG_UNIT = 0x00000040, CAM_ARG_FORMAT_BLOCK = 0x00000080, CAM_ARG_FORMAT_BFI = 0x00000100, CAM_ARG_FORMAT_PHYS = 0x00000200, CAM_ARG_PLIST = 0x00000400, CAM_ARG_GLIST = 0x00000800, CAM_ARG_GET_SERIAL = 0x00001000, CAM_ARG_GET_STDINQ = 0x00002000, CAM_ARG_GET_XFERRATE = 0x00004000, CAM_ARG_INQ_MASK = 0x00007000, CAM_ARG_TIMEOUT = 0x00020000, CAM_ARG_CMD_IN = 0x00040000, CAM_ARG_CMD_OUT = 0x00080000, CAM_ARG_ERR_RECOVER = 0x00200000, CAM_ARG_RETRIES = 0x00400000, CAM_ARG_START_UNIT = 0x00800000, CAM_ARG_DEBUG_INFO = 0x01000000, CAM_ARG_DEBUG_TRACE = 0x02000000, CAM_ARG_DEBUG_SUBTRACE = 0x04000000, CAM_ARG_DEBUG_CDB = 0x08000000, CAM_ARG_DEBUG_XPT = 0x10000000, CAM_ARG_DEBUG_PERIPH = 0x20000000, CAM_ARG_DEBUG_PROBE = 0x40000000, } cam_argmask; struct camcontrol_opts { const char *optname; uint32_t cmdnum; cam_argmask argnum; const char *subopt; }; struct ata_res_pass16 { u_int16_t reserved[5]; u_int8_t flags; u_int8_t error; u_int8_t sector_count_exp; u_int8_t sector_count; u_int8_t lba_low_exp; u_int8_t lba_low; u_int8_t lba_mid_exp; u_int8_t lba_mid; u_int8_t lba_high_exp; u_int8_t lba_high; u_int8_t device; u_int8_t status; }; struct ata_set_max_pwd { u_int16_t reserved1; u_int8_t password[32]; u_int16_t reserved2[239]; }; static struct scsi_nv task_attrs[] = { { "simple", MSG_SIMPLE_Q_TAG }, { "head", MSG_HEAD_OF_Q_TAG }, { "ordered", MSG_ORDERED_Q_TAG }, { "iwr", MSG_IGN_WIDE_RESIDUE }, { "aca", MSG_ACA_TASK } }; static const char scsicmd_opts[] = "a:c:dfi:o:r"; static const char readdefect_opts[] = "f:GPqsS:X"; static const char negotiate_opts[] = "acD:M:O:qR:T:UW:"; static const char smprg_opts[] = "l"; static const char smppc_opts[] = "a:A:d:lm:M:o:p:s:S:T:"; static const char smpphylist_opts[] = "lq"; static char pwd_opt; static struct camcontrol_opts option_table[] = { {"tur", CAM_CMD_TUR, CAM_ARG_NONE, NULL}, {"inquiry", CAM_CMD_INQUIRY, CAM_ARG_NONE, "DSR"}, {"identify", CAM_CMD_IDENTIFY, CAM_ARG_NONE, NULL}, {"start", CAM_CMD_STARTSTOP, CAM_ARG_START_UNIT, NULL}, {"stop", CAM_CMD_STARTSTOP, CAM_ARG_NONE, NULL}, {"load", CAM_CMD_STARTSTOP, CAM_ARG_START_UNIT | CAM_ARG_EJECT, NULL}, {"eject", CAM_CMD_STARTSTOP, CAM_ARG_EJECT, NULL}, {"reportluns", CAM_CMD_REPORTLUNS, CAM_ARG_NONE, "clr:"}, {"readcapacity", CAM_CMD_READCAP, CAM_ARG_NONE, "bhHlNqs"}, {"reprobe", CAM_CMD_REPROBE, CAM_ARG_NONE, NULL}, {"rescan", CAM_CMD_RESCAN, CAM_ARG_NONE, NULL}, {"reset", CAM_CMD_RESET, CAM_ARG_NONE, NULL}, {"cmd", CAM_CMD_SCSI_CMD, CAM_ARG_NONE, scsicmd_opts}, {"mmcsdcmd", CAM_CMD_MMCSD_CMD, CAM_ARG_NONE, "c:a:f:Wb:l:41S:I"}, {"command", CAM_CMD_SCSI_CMD, CAM_ARG_NONE, scsicmd_opts}, {"smpcmd", CAM_CMD_SMP_CMD, CAM_ARG_NONE, "r:R:"}, {"smprg", CAM_CMD_SMP_RG, CAM_ARG_NONE, smprg_opts}, {"smpreportgeneral", CAM_CMD_SMP_RG, CAM_ARG_NONE, smprg_opts}, {"smppc", CAM_CMD_SMP_PC, CAM_ARG_NONE, smppc_opts}, {"smpphycontrol", CAM_CMD_SMP_PC, CAM_ARG_NONE, smppc_opts}, {"smpplist", CAM_CMD_SMP_PHYLIST, CAM_ARG_NONE, smpphylist_opts}, {"smpphylist", CAM_CMD_SMP_PHYLIST, CAM_ARG_NONE, smpphylist_opts}, {"smpmaninfo", CAM_CMD_SMP_MANINFO, CAM_ARG_NONE, "l"}, {"defects", CAM_CMD_READ_DEFECTS, CAM_ARG_NONE, readdefect_opts}, {"defectlist", CAM_CMD_READ_DEFECTS, CAM_ARG_NONE, readdefect_opts}, {"devlist", CAM_CMD_DEVTREE, CAM_ARG_NONE, "-b"}, {"devtype", CAM_CMD_DEVTYPE, CAM_ARG_NONE, ""}, {"periphlist", CAM_CMD_DEVLIST, CAM_ARG_NONE, NULL}, {"modepage", CAM_CMD_MODE_PAGE, CAM_ARG_NONE, "bdelm:P:"}, {"tags", CAM_CMD_TAG, CAM_ARG_NONE, "N:q"}, {"negotiate", CAM_CMD_RATE, CAM_ARG_NONE, negotiate_opts}, {"rate", CAM_CMD_RATE, CAM_ARG_NONE, negotiate_opts}, {"debug", CAM_CMD_DEBUG, CAM_ARG_NONE, "IPTSXcp"}, {"format", CAM_CMD_FORMAT, CAM_ARG_NONE, "qrwy"}, {"sanitize", CAM_CMD_SANITIZE, CAM_ARG_NONE, "a:c:IP:qrUwy"}, {"idle", CAM_CMD_IDLE, CAM_ARG_NONE, "t:"}, {"standby", CAM_CMD_STANDBY, CAM_ARG_NONE, "t:"}, {"sleep", CAM_CMD_SLEEP, CAM_ARG_NONE, ""}, {"powermode", CAM_CMD_POWER_MODE, CAM_ARG_NONE, ""}, {"apm", CAM_CMD_APM, CAM_ARG_NONE, "l:"}, {"aam", CAM_CMD_AAM, CAM_ARG_NONE, "l:"}, {"fwdownload", CAM_CMD_DOWNLOAD_FW, CAM_ARG_NONE, "f:qsy"}, {"security", CAM_CMD_SECURITY, CAM_ARG_NONE, "d:e:fh:k:l:qs:T:U:y"}, {"hpa", CAM_CMD_HPA, CAM_ARG_NONE, "Pflp:qs:U:y"}, {"ama", CAM_CMD_AMA, CAM_ARG_NONE, "fqs:"}, {"persist", CAM_CMD_PERSIST, CAM_ARG_NONE, "ai:I:k:K:o:ps:ST:U"}, {"attrib", CAM_CMD_ATTRIB, CAM_ARG_NONE, "a:ce:F:p:r:s:T:w:V:"}, {"opcodes", CAM_CMD_OPCODES, CAM_ARG_NONE, "No:s:T"}, {"zone", CAM_CMD_ZONE, CAM_ARG_NONE, "ac:l:No:P:"}, {"epc", CAM_CMD_EPC, CAM_ARG_NONE, "c:dDeHp:Pr:sS:T:"}, {"timestamp", CAM_CMD_TIMESTAMP, CAM_ARG_NONE, "f:mrsUT:"}, {"help", CAM_CMD_USAGE, CAM_ARG_NONE, NULL}, {"-?", CAM_CMD_USAGE, CAM_ARG_NONE, NULL}, {"-h", CAM_CMD_USAGE, CAM_ARG_NONE, NULL}, {NULL, 0, 0, NULL} }; struct cam_devitem { struct device_match_result dev_match; int num_periphs; struct periph_match_result *periph_matches; struct scsi_vpd_device_id *device_id; int device_id_len; STAILQ_ENTRY(cam_devitem) links; }; struct cam_devlist { STAILQ_HEAD(, cam_devitem) dev_queue; path_id_t path_id; }; static cam_cmdmask cmdlist; static cam_argmask arglist; static const char *devtype_names[] = { "none", "scsi", "satl", "ata", "nvme", "mmcsd", "unknown", }; camcontrol_optret getoption(struct camcontrol_opts *table, char *arg, uint32_t *cmdnum, cam_argmask *argnum, const char **subopt); static int getdevlist(struct cam_device *device); static int getdevtree(int argc, char **argv, char *combinedopt); static int getdevtype(struct cam_device *device); static int print_dev_scsi(struct device_match_result *dev_result, char *tmpstr); static int print_dev_ata(struct device_match_result *dev_result, char *tmpstr); static int print_dev_semb(struct device_match_result *dev_result, char *tmpstr); static int print_dev_mmcsd(struct device_match_result *dev_result, char *tmpstr); #ifdef WITH_NVME static int print_dev_nvme(struct device_match_result *dev_result, char *tmpstr); #endif static int testunitready(struct cam_device *device, int task_attr, int retry_count, int timeout, int quiet); static int scsistart(struct cam_device *device, int startstop, int loadeject, int task_attr, int retry_count, int timeout); static int scsiinquiry(struct cam_device *device, int task_attr, int retry_count, int timeout); static int scsiserial(struct cam_device *device, int task_attr, int retry_count, int timeout); static int parse_btl(char *tstr, path_id_t *bus, target_id_t *target, lun_id_t *lun, cam_argmask *arglst); static int reprobe(struct cam_device *device); static int dorescan_or_reset(int argc, char **argv, int rescan); static int rescan_or_reset_bus(path_id_t bus, int rescan); static int scanlun_or_reset_dev(path_id_t bus, target_id_t target, lun_id_t lun, int scan); static int readdefects(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout); static void modepage(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout); static int scsicmd(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout); static int smpcmd(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout); static int mmcsdcmd(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout); static int smpreportgeneral(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout); static int smpphycontrol(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout); static int smpmaninfo(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout); static int getdevid(struct cam_devitem *item); static int buildbusdevlist(struct cam_devlist *devlist); static void freebusdevlist(struct cam_devlist *devlist); static struct cam_devitem *findsasdevice(struct cam_devlist *devlist, uint64_t sasaddr); static int smpphylist(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout); static int tagcontrol(struct cam_device *device, int argc, char **argv, char *combinedopt); static void cts_print(struct cam_device *device, struct ccb_trans_settings *cts); static void cpi_print(struct ccb_pathinq *cpi); static int get_cpi(struct cam_device *device, struct ccb_pathinq *cpi); static int get_cgd(struct cam_device *device, struct ccb_getdev *cgd); static int get_print_cts(struct cam_device *device, int user_settings, int quiet, struct ccb_trans_settings *cts); static int ratecontrol(struct cam_device *device, int task_attr, int retry_count, int timeout, int argc, char **argv, char *combinedopt); static int scsiformat(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout); static int sanitize(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout); static int scsireportluns(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout); static int scsireadcapacity(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout); static int atapm(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout); static int atasecurity(struct cam_device *device, int retry_count, int timeout, int argc, char **argv, char *combinedopt); static int atahpa(struct cam_device *device, int retry_count, int timeout, int argc, char **argv, char *combinedopt); static int ataama(struct cam_device *device, int retry_count, int timeout, int argc, char **argv, char *combinedopt); static int scsiprintoneopcode(struct cam_device *device, int req_opcode, int sa_set, int req_sa, uint8_t *buf, uint32_t valid_len); static int scsiprintopcodes(struct cam_device *device, int td_req, uint8_t *buf, uint32_t valid_len); static int scsiopcodes(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout, int verbose); #ifndef min #define min(a,b) (((a)<(b))?(a):(b)) #endif #ifndef max #define max(a,b) (((a)>(b))?(a):(b)) #endif camcontrol_optret getoption(struct camcontrol_opts *table, char *arg, uint32_t *cmdnum, cam_argmask *argnum, const char **subopt) { struct camcontrol_opts *opts; int num_matches = 0; for (opts = table; (opts != NULL) && (opts->optname != NULL); opts++) { if (strncmp(opts->optname, arg, strlen(arg)) == 0) { *cmdnum = opts->cmdnum; *argnum = opts->argnum; *subopt = opts->subopt; if (++num_matches > 1) return (CC_OR_AMBIGUOUS); } } if (num_matches > 0) return (CC_OR_FOUND); else return (CC_OR_NOT_FOUND); } static int getdevlist(struct cam_device *device) { union ccb *ccb; char status[32]; int error = 0; ccb = cam_getccb(device); ccb->ccb_h.func_code = XPT_GDEVLIST; ccb->ccb_h.flags = CAM_DIR_NONE; ccb->ccb_h.retry_count = 1; ccb->cgdl.index = 0; ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS; while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) { if (cam_send_ccb(device, ccb) < 0) { perror("error getting device list"); cam_freeccb(ccb); return (1); } status[0] = '\0'; switch (ccb->cgdl.status) { case CAM_GDEVLIST_MORE_DEVS: strcpy(status, "MORE"); break; case CAM_GDEVLIST_LAST_DEVICE: strcpy(status, "LAST"); break; case CAM_GDEVLIST_LIST_CHANGED: strcpy(status, "CHANGED"); break; case CAM_GDEVLIST_ERROR: strcpy(status, "ERROR"); error = 1; break; } fprintf(stdout, "%s%d: generation: %d index: %d status: %s\n", ccb->cgdl.periph_name, ccb->cgdl.unit_number, ccb->cgdl.generation, ccb->cgdl.index, status); /* * If the list has changed, we need to start over from the * beginning. */ if (ccb->cgdl.status == CAM_GDEVLIST_LIST_CHANGED) ccb->cgdl.index = 0; } cam_freeccb(ccb); return (error); } static int getdevtree(int argc, char **argv, char *combinedopt) { union ccb ccb; int bufsize, fd; unsigned int i; int need_close = 0; int error = 0; int skip_device = 0; int busonly = 0; int c; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'b': if ((arglist & CAM_ARG_VERBOSE) == 0) busonly = 1; break; default: break; } } if ((fd = open(XPT_DEVICE, O_RDWR)) == -1) { warn("couldn't open %s", XPT_DEVICE); return (1); } bzero(&ccb, sizeof(union ccb)); ccb.ccb_h.path_id = CAM_XPT_PATH_ID; ccb.ccb_h.target_id = CAM_TARGET_WILDCARD; ccb.ccb_h.target_lun = CAM_LUN_WILDCARD; ccb.ccb_h.func_code = XPT_DEV_MATCH; bufsize = sizeof(struct dev_match_result) * 100; ccb.cdm.match_buf_len = bufsize; ccb.cdm.matches = (struct dev_match_result *)malloc(bufsize); if (ccb.cdm.matches == NULL) { warnx("can't malloc memory for matches"); close(fd); return (1); } ccb.cdm.num_matches = 0; /* * We fetch all nodes, since we display most of them in the default * case, and all in the verbose case. */ ccb.cdm.num_patterns = 0; ccb.cdm.pattern_buf_len = 0; /* * We do the ioctl multiple times if necessary, in case there are * more than 100 nodes in the EDT. */ do { if (ioctl(fd, CAMIOCOMMAND, &ccb) == -1) { warn("error sending CAMIOCOMMAND ioctl"); error = 1; break; } if ((ccb.ccb_h.status != CAM_REQ_CMP) || ((ccb.cdm.status != CAM_DEV_MATCH_LAST) && (ccb.cdm.status != CAM_DEV_MATCH_MORE))) { warnx("got CAM error %#x, CDM error %d\n", ccb.ccb_h.status, ccb.cdm.status); error = 1; break; } for (i = 0; i < ccb.cdm.num_matches; i++) { switch (ccb.cdm.matches[i].type) { case DEV_MATCH_BUS: { struct bus_match_result *bus_result; /* * Only print the bus information if the * user turns on the verbose flag. */ if ((busonly == 0) && (arglist & CAM_ARG_VERBOSE) == 0) break; bus_result = &ccb.cdm.matches[i].result.bus_result; if (need_close) { fprintf(stdout, ")\n"); need_close = 0; } fprintf(stdout, "scbus%d on %s%d bus %d%s\n", bus_result->path_id, bus_result->dev_name, bus_result->unit_number, bus_result->bus_id, (busonly ? "" : ":")); break; } case DEV_MATCH_DEVICE: { struct device_match_result *dev_result; char tmpstr[256]; if (busonly == 1) break; dev_result = &ccb.cdm.matches[i].result.device_result; if ((dev_result->flags & DEV_RESULT_UNCONFIGURED) && ((arglist & CAM_ARG_VERBOSE) == 0)) { skip_device = 1; break; } else skip_device = 0; if (dev_result->protocol == PROTO_SCSI) { if (print_dev_scsi(dev_result, &tmpstr[0]) != 0) { skip_device = 1; break; } } else if (dev_result->protocol == PROTO_ATA || dev_result->protocol == PROTO_SATAPM) { if (print_dev_ata(dev_result, &tmpstr[0]) != 0) { skip_device = 1; break; } } else if (dev_result->protocol == PROTO_MMCSD){ if (print_dev_mmcsd(dev_result, &tmpstr[0]) != 0) { skip_device = 1; break; } } else if (dev_result->protocol == PROTO_SEMB) { if (print_dev_semb(dev_result, &tmpstr[0]) != 0) { skip_device = 1; break; } #ifdef WITH_NVME } else if (dev_result->protocol == PROTO_NVME) { if (print_dev_nvme(dev_result, &tmpstr[0]) != 0) { skip_device = 1; break; } #endif } else { sprintf(tmpstr, "<>"); } if (need_close) { fprintf(stdout, ")\n"); need_close = 0; } fprintf(stdout, "%-33s at scbus%d " "target %d lun %jx (", tmpstr, dev_result->path_id, dev_result->target_id, (uintmax_t)dev_result->target_lun); need_close = 1; break; } case DEV_MATCH_PERIPH: { struct periph_match_result *periph_result; periph_result = &ccb.cdm.matches[i].result.periph_result; if (busonly || skip_device != 0) break; if (need_close > 1) fprintf(stdout, ","); fprintf(stdout, "%s%d", periph_result->periph_name, periph_result->unit_number); need_close++; break; } default: fprintf(stdout, "unknown match type\n"); break; } } } while ((ccb.ccb_h.status == CAM_REQ_CMP) && (ccb.cdm.status == CAM_DEV_MATCH_MORE)); if (need_close) fprintf(stdout, ")\n"); close(fd); return (error); } static int getdevtype(struct cam_device *cam_dev) { camcontrol_devtype dt; int error; /* * Get the device type and report it, request no I/O be done to do this. */ error = get_device_type(cam_dev, -1, 0, 0, &dt); if (error != 0 || (unsigned)dt > CC_DT_UNKNOWN) { fprintf(stdout, "illegal\n"); return (1); } fprintf(stdout, "%s\n", devtype_names[dt]); return (0); } static int print_dev_scsi(struct device_match_result *dev_result, char *tmpstr) { char vendor[16], product[48], revision[16]; cam_strvis(vendor, dev_result->inq_data.vendor, sizeof(dev_result->inq_data.vendor), sizeof(vendor)); cam_strvis(product, dev_result->inq_data.product, sizeof(dev_result->inq_data.product), sizeof(product)); cam_strvis(revision, dev_result->inq_data.revision, sizeof(dev_result->inq_data.revision), sizeof(revision)); sprintf(tmpstr, "<%s %s %s>", vendor, product, revision); return (0); } static int print_dev_ata(struct device_match_result *dev_result, char *tmpstr) { char product[48], revision[16]; cam_strvis(product, dev_result->ident_data.model, sizeof(dev_result->ident_data.model), sizeof(product)); cam_strvis(revision, dev_result->ident_data.revision, sizeof(dev_result->ident_data.revision), sizeof(revision)); sprintf(tmpstr, "<%s %s>", product, revision); return (0); } static int print_dev_semb(struct device_match_result *dev_result, char *tmpstr) { struct sep_identify_data *sid; char vendor[16], product[48], revision[16], fw[5]; sid = (struct sep_identify_data *)&dev_result->ident_data; cam_strvis(vendor, sid->vendor_id, sizeof(sid->vendor_id), sizeof(vendor)); cam_strvis(product, sid->product_id, sizeof(sid->product_id), sizeof(product)); cam_strvis(revision, sid->product_rev, sizeof(sid->product_rev), sizeof(revision)); cam_strvis(fw, sid->firmware_rev, sizeof(sid->firmware_rev), sizeof(fw)); sprintf(tmpstr, "<%s %s %s %s>", vendor, product, revision, fw); return (0); } static int print_dev_mmcsd(struct device_match_result *dev_result, char *tmpstr) { union ccb *ccb; struct ccb_dev_advinfo *advi; struct cam_device *dev; struct mmc_params mmc_ident_data; dev = cam_open_btl(dev_result->path_id, dev_result->target_id, dev_result->target_lun, O_RDWR, NULL); if (dev == NULL) { warnx("%s", cam_errbuf); return (1); } ccb = cam_getccb(dev); if (ccb == NULL) { warnx("couldn't allocate CCB"); cam_close_device(dev); return (1); } advi = &ccb->cdai; advi->ccb_h.flags = CAM_DIR_IN; advi->ccb_h.func_code = XPT_DEV_ADVINFO; advi->flags = CDAI_FLAG_NONE; advi->buftype = CDAI_TYPE_MMC_PARAMS; advi->bufsiz = sizeof(struct mmc_params); advi->buf = (uint8_t *)&mmc_ident_data; if (cam_send_ccb(dev, ccb) < 0) { warn("error sending CAMIOCOMMAND ioctl"); cam_freeccb(ccb); cam_close_device(dev); return (1); } if (strlen(mmc_ident_data.model) > 0) { sprintf(tmpstr, "<%s>", mmc_ident_data.model); } else { sprintf(tmpstr, "<%s card>", mmc_ident_data.card_features & CARD_FEATURE_SDIO ? "SDIO" : "unknown"); } cam_freeccb(ccb); cam_close_device(dev); return (0); } #ifdef WITH_NVME static int nvme_get_cdata(struct cam_device *dev, struct nvme_controller_data *cdata) { union ccb *ccb; struct ccb_dev_advinfo *advi; ccb = cam_getccb(dev); if (ccb == NULL) { warnx("couldn't allocate CCB"); cam_close_device(dev); return (1); } advi = &ccb->cdai; advi->ccb_h.flags = CAM_DIR_IN; advi->ccb_h.func_code = XPT_DEV_ADVINFO; advi->flags = CDAI_FLAG_NONE; advi->buftype = CDAI_TYPE_NVME_CNTRL; advi->bufsiz = sizeof(struct nvme_controller_data); advi->buf = (uint8_t *)cdata; if (cam_send_ccb(dev, ccb) < 0) { warn("error sending CAMIOCOMMAND ioctl"); cam_freeccb(ccb); cam_close_device(dev); return(1); } if (advi->ccb_h.status != CAM_REQ_CMP) { warnx("got CAM error %#x", advi->ccb_h.status); cam_freeccb(ccb); cam_close_device(dev); return(1); } cam_freeccb(ccb); return 0; } static int print_dev_nvme(struct device_match_result *dev_result, char *tmpstr) { struct cam_device *dev; struct nvme_controller_data cdata; char vendor[64], product[64]; dev = cam_open_btl(dev_result->path_id, dev_result->target_id, dev_result->target_lun, O_RDWR, NULL); if (dev == NULL) { warnx("%s", cam_errbuf); return (1); } if (nvme_get_cdata(dev, &cdata)) return (1); cam_strvis(vendor, cdata.mn, sizeof(cdata.mn), sizeof(vendor)); cam_strvis(product, cdata.fr, sizeof(cdata.fr), sizeof(product)); sprintf(tmpstr, "<%s %s>", vendor, product); cam_close_device(dev); return (0); } #endif static int testunitready(struct cam_device *device, int task_attr, int retry_count, int timeout, int quiet) { int error = 0; union ccb *ccb; ccb = cam_getccb(device); scsi_test_unit_ready(&ccb->csio, /* retries */ retry_count, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* sense_len */ SSD_FULL_SIZE, /* timeout */ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { if (quiet == 0) perror("error sending test unit ready"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } cam_freeccb(ccb); return (1); } if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { if (quiet == 0) fprintf(stdout, "Unit is ready\n"); } else { if (quiet == 0) fprintf(stdout, "Unit is not ready\n"); error = 1; if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } } cam_freeccb(ccb); return (error); } static int scsistart(struct cam_device *device, int startstop, int loadeject, int task_attr, int retry_count, int timeout) { union ccb *ccb; int error = 0; ccb = cam_getccb(device); /* * If we're stopping, send an ordered tag so the drive in question * will finish any previously queued writes before stopping. If * the device isn't capable of tagged queueing, or if tagged * queueing is turned off, the tag action is a no-op. We override * the default simple tag, although this also has the effect of * overriding the user's wishes if he wanted to specify a simple * tag. */ if ((startstop == 0) && (task_attr == MSG_SIMPLE_Q_TAG)) task_attr = MSG_ORDERED_Q_TAG; scsi_start_stop(&ccb->csio, /* retries */ retry_count, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* start/stop */ startstop, /* load_eject */ loadeject, /* immediate */ 0, /* sense_len */ SSD_FULL_SIZE, /* timeout */ timeout ? timeout : 120000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { perror("error sending start unit"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } cam_freeccb(ccb); return (1); } if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) if (startstop) { fprintf(stdout, "Unit started successfully"); if (loadeject) fprintf(stdout,", Media loaded\n"); else fprintf(stdout,"\n"); } else { fprintf(stdout, "Unit stopped successfully"); if (loadeject) fprintf(stdout, ", Media ejected\n"); else fprintf(stdout, "\n"); } else { error = 1; if (startstop) fprintf(stdout, "Error received from start unit command\n"); else fprintf(stdout, "Error received from stop unit command\n"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } } cam_freeccb(ccb); return (error); } int scsidoinquiry(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { int c; int error = 0; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'D': arglist |= CAM_ARG_GET_STDINQ; break; case 'R': arglist |= CAM_ARG_GET_XFERRATE; break; case 'S': arglist |= CAM_ARG_GET_SERIAL; break; default: break; } } /* * If the user didn't specify any inquiry options, he wants all of * them. */ if ((arglist & CAM_ARG_INQ_MASK) == 0) arglist |= CAM_ARG_INQ_MASK; if (arglist & CAM_ARG_GET_STDINQ) error = scsiinquiry(device, task_attr, retry_count, timeout); if (error != 0) return (error); if (arglist & CAM_ARG_GET_SERIAL) scsiserial(device, task_attr, retry_count, timeout); if (arglist & CAM_ARG_GET_XFERRATE) error = camxferrate(device); return (error); } static int scsiinquiry(struct cam_device *device, int task_attr, int retry_count, int timeout) { union ccb *ccb; struct scsi_inquiry_data *inq_buf; int error = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("couldn't allocate CCB"); return (1); } /* cam_getccb cleans up the header, caller has to zero the payload */ CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); inq_buf = (struct scsi_inquiry_data *)malloc( sizeof(struct scsi_inquiry_data)); if (inq_buf == NULL) { cam_freeccb(ccb); warnx("can't malloc memory for inquiry\n"); return (1); } bzero(inq_buf, sizeof(*inq_buf)); /* * Note that although the size of the inquiry buffer is the full * 256 bytes specified in the SCSI spec, we only tell the device * that we have allocated SHORT_INQUIRY_LENGTH bytes. There are * two reasons for this: * * - The SCSI spec says that when a length field is only 1 byte, * a value of 0 will be interpreted as 256. Therefore * scsi_inquiry() will convert an inq_len (which is passed in as * a u_int32_t, but the field in the CDB is only 1 byte) of 256 * to 0. Evidently, very few devices meet the spec in that * regard. Some devices, like many Seagate disks, take the 0 as * 0, and don't return any data. One Pioneer DVD-R drive * returns more data than the command asked for. * * So, since there are numerous devices that just don't work * right with the full inquiry size, we don't send the full size. * * - The second reason not to use the full inquiry data length is * that we don't need it here. The only reason we issue a * standard inquiry is to get the vendor name, device name, * and revision so scsi_print_inquiry() can print them. * * If, at some point in the future, more inquiry data is needed for * some reason, this code should use a procedure similar to the * probe code. i.e., issue a short inquiry, and determine from * the additional length passed back from the device how much * inquiry data the device supports. Once the amount the device * supports is determined, issue an inquiry for that amount and no * more. * * KDM, 2/18/2000 */ scsi_inquiry(&ccb->csio, /* retries */ retry_count, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* inq_buf */ (u_int8_t *)inq_buf, /* inq_len */ SHORT_INQUIRY_LENGTH, /* evpd */ 0, /* page_code */ 0, /* sense_len */ SSD_FULL_SIZE, /* timeout */ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { perror("error sending SCSI inquiry"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } cam_freeccb(ccb); return (1); } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = 1; if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } } cam_freeccb(ccb); if (error != 0) { free(inq_buf); return (error); } fprintf(stdout, "%s%d: ", device->device_name, device->dev_unit_num); scsi_print_inquiry(inq_buf); free(inq_buf); return (0); } static int scsiserial(struct cam_device *device, int task_attr, int retry_count, int timeout) { union ccb *ccb; struct scsi_vpd_unit_serial_number *serial_buf; char serial_num[SVPD_SERIAL_NUM_SIZE + 1]; int error = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("couldn't allocate CCB"); return (1); } /* cam_getccb cleans up the header, caller has to zero the payload */ CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); serial_buf = (struct scsi_vpd_unit_serial_number *) malloc(sizeof(*serial_buf)); if (serial_buf == NULL) { cam_freeccb(ccb); warnx("can't malloc memory for serial number"); return (1); } scsi_inquiry(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /* tag_action */ task_attr, /* inq_buf */ (u_int8_t *)serial_buf, /* inq_len */ sizeof(*serial_buf), /* evpd */ 1, /* page_code */ SVPD_UNIT_SERIAL_NUMBER, /* sense_len */ SSD_FULL_SIZE, /* timeout */ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { warn("error getting serial number"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } cam_freeccb(ccb); free(serial_buf); return (1); } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = 1; if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } } cam_freeccb(ccb); if (error != 0) { free(serial_buf); return (error); } bcopy(serial_buf->serial_num, serial_num, serial_buf->length); serial_num[serial_buf->length] = '\0'; if ((arglist & CAM_ARG_GET_STDINQ) || (arglist & CAM_ARG_GET_XFERRATE)) fprintf(stdout, "%s%d: Serial Number ", device->device_name, device->dev_unit_num); fprintf(stdout, "%.60s\n", serial_num); free(serial_buf); return (0); } int camxferrate(struct cam_device *device) { struct ccb_pathinq cpi; u_int32_t freq = 0; u_int32_t speed = 0; union ccb *ccb; u_int mb; int retval = 0; if ((retval = get_cpi(device, &cpi)) != 0) return (1); ccb = cam_getccb(device); if (ccb == NULL) { warnx("couldn't allocate CCB"); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cts); ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS; ccb->cts.type = CTS_TYPE_CURRENT_SETTINGS; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char error_string[] = "error getting transfer settings"; if (retval < 0) warn(error_string); else warnx(error_string); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto xferrate_bailout; } speed = cpi.base_transfer_speed; freq = 0; if (ccb->cts.transport == XPORT_SPI) { struct ccb_trans_settings_spi *spi = &ccb->cts.xport_specific.spi; if ((spi->valid & CTS_SPI_VALID_SYNC_RATE) != 0) { freq = scsi_calc_syncsrate(spi->sync_period); speed = freq; } if ((spi->valid & CTS_SPI_VALID_BUS_WIDTH) != 0) { speed *= (0x01 << spi->bus_width); } } else if (ccb->cts.transport == XPORT_FC) { struct ccb_trans_settings_fc *fc = &ccb->cts.xport_specific.fc; if (fc->valid & CTS_FC_VALID_SPEED) speed = fc->bitrate; } else if (ccb->cts.transport == XPORT_SAS) { struct ccb_trans_settings_sas *sas = &ccb->cts.xport_specific.sas; if (sas->valid & CTS_SAS_VALID_SPEED) speed = sas->bitrate; } else if (ccb->cts.transport == XPORT_ATA) { struct ccb_trans_settings_pata *pata = &ccb->cts.xport_specific.ata; if (pata->valid & CTS_ATA_VALID_MODE) speed = ata_mode2speed(pata->mode); } else if (ccb->cts.transport == XPORT_SATA) { struct ccb_trans_settings_sata *sata = &ccb->cts.xport_specific.sata; if (sata->valid & CTS_SATA_VALID_REVISION) speed = ata_revision2speed(sata->revision); } mb = speed / 1000; if (mb > 0) { fprintf(stdout, "%s%d: %d.%03dMB/s transfers", device->device_name, device->dev_unit_num, mb, speed % 1000); } else { fprintf(stdout, "%s%d: %dKB/s transfers", device->device_name, device->dev_unit_num, speed); } if (ccb->cts.transport == XPORT_SPI) { struct ccb_trans_settings_spi *spi = &ccb->cts.xport_specific.spi; if (((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) != 0) && (spi->sync_offset != 0)) fprintf(stdout, " (%d.%03dMHz, offset %d", freq / 1000, freq % 1000, spi->sync_offset); if (((spi->valid & CTS_SPI_VALID_BUS_WIDTH) != 0) && (spi->bus_width > 0)) { if (((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) != 0) && (spi->sync_offset != 0)) { fprintf(stdout, ", "); } else { fprintf(stdout, " ("); } fprintf(stdout, "%dbit)", 8 * (0x01 << spi->bus_width)); } else if (((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) != 0) && (spi->sync_offset != 0)) { fprintf(stdout, ")"); } } else if (ccb->cts.transport == XPORT_ATA) { struct ccb_trans_settings_pata *pata = &ccb->cts.xport_specific.ata; printf(" ("); if (pata->valid & CTS_ATA_VALID_MODE) printf("%s, ", ata_mode2string(pata->mode)); if ((pata->valid & CTS_ATA_VALID_ATAPI) && pata->atapi != 0) printf("ATAPI %dbytes, ", pata->atapi); if (pata->valid & CTS_ATA_VALID_BYTECOUNT) printf("PIO %dbytes", pata->bytecount); printf(")"); } else if (ccb->cts.transport == XPORT_SATA) { struct ccb_trans_settings_sata *sata = &ccb->cts.xport_specific.sata; printf(" ("); if (sata->valid & CTS_SATA_VALID_REVISION) printf("SATA %d.x, ", sata->revision); else printf("SATA, "); if (sata->valid & CTS_SATA_VALID_MODE) printf("%s, ", ata_mode2string(sata->mode)); if ((sata->valid & CTS_SATA_VALID_ATAPI) && sata->atapi != 0) printf("ATAPI %dbytes, ", sata->atapi); if (sata->valid & CTS_SATA_VALID_BYTECOUNT) printf("PIO %dbytes", sata->bytecount); printf(")"); } if (ccb->cts.protocol == PROTO_SCSI) { struct ccb_trans_settings_scsi *scsi = &ccb->cts.proto_specific.scsi; if (scsi->valid & CTS_SCSI_VALID_TQ) { if (scsi->flags & CTS_SCSI_FLAGS_TAG_ENB) { fprintf(stdout, ", Command Queueing Enabled"); } } } fprintf(stdout, "\n"); xferrate_bailout: cam_freeccb(ccb); return (retval); } static void atahpa_print(struct ata_params *parm, u_int64_t hpasize, int header) { u_int32_t lbasize = (u_int32_t)parm->lba_size_1 | ((u_int32_t)parm->lba_size_2 << 16); u_int64_t lbasize48 = ((u_int64_t)parm->lba_size48_1) | ((u_int64_t)parm->lba_size48_2 << 16) | ((u_int64_t)parm->lba_size48_3 << 32) | ((u_int64_t)parm->lba_size48_4 << 48); if (header) { printf("\nFeature " "Support Enabled Value\n"); } printf("Host Protected Area (HPA) "); if (parm->support.command1 & ATA_SUPPORT_PROTECTED) { u_int64_t lba = lbasize48 ? lbasize48 : lbasize; printf("yes %s %ju/%ju\n", (hpasize > lba) ? "yes" : "no ", lba, hpasize); printf("HPA - Security "); if (parm->support.command2 & ATA_SUPPORT_MAXSECURITY) printf("yes %s\n", (parm->enabled.command2 & ATA_SUPPORT_MAXSECURITY) ? "yes" : "no "); else printf("no\n"); } else { printf("no\n"); } } static void ataama_print(struct ata_params *parm, u_int64_t nativesize, int header) { u_int32_t lbasize = (u_int32_t)parm->lba_size_1 | ((u_int32_t)parm->lba_size_2 << 16); u_int64_t lbasize48 = ((u_int64_t)parm->lba_size48_1) | ((u_int64_t)parm->lba_size48_2 << 16) | ((u_int64_t)parm->lba_size48_3 << 32) | ((u_int64_t)parm->lba_size48_4 << 48); if (header) { printf("\nFeature " "Support Enabled Value\n"); } printf("Accessible Max Address Config "); if (parm->support2 & ATA_SUPPORT_AMAX_ADDR) { u_int64_t lba = lbasize48 ? lbasize48 : lbasize; printf("yes %s %ju/%ju\n", (nativesize > lba) ? "yes" : "no ", lba, nativesize); } else { printf("no\n"); } } static int atasata(struct ata_params *parm) { if (parm->satacapabilities != 0xffff && parm->satacapabilities != 0x0000) return 1; return 0; } static void atacapprint(struct ata_params *parm) { const char *proto; u_int32_t lbasize = (u_int32_t)parm->lba_size_1 | ((u_int32_t)parm->lba_size_2 << 16); u_int64_t lbasize48 = ((u_int64_t)parm->lba_size48_1) | ((u_int64_t)parm->lba_size48_2 << 16) | ((u_int64_t)parm->lba_size48_3 << 32) | ((u_int64_t)parm->lba_size48_4 << 48); printf("\n"); printf("protocol "); proto = (parm->config == ATA_PROTO_CFA) ? "CFA" : (parm->config & ATA_PROTO_ATAPI) ? "ATAPI" : "ATA"; if (ata_version(parm->version_major) == 0) { printf("%s", proto); } else if (ata_version(parm->version_major) <= 7) { printf("%s-%d", proto, ata_version(parm->version_major)); } else if (ata_version(parm->version_major) == 8) { printf("%s8-ACS", proto); } else { printf("ACS-%d %s", ata_version(parm->version_major) - 7, proto); } if (parm->satacapabilities && parm->satacapabilities != 0xffff) { if (parm->satacapabilities & ATA_SATA_GEN3) printf(" SATA 3.x\n"); else if (parm->satacapabilities & ATA_SATA_GEN2) printf(" SATA 2.x\n"); else if (parm->satacapabilities & ATA_SATA_GEN1) printf(" SATA 1.x\n"); else printf(" SATA\n"); } else printf("\n"); printf("device model %.40s\n", parm->model); printf("firmware revision %.8s\n", parm->revision); printf("serial number %.20s\n", parm->serial); if (parm->enabled.extension & ATA_SUPPORT_64BITWWN) { printf("WWN %04x%04x%04x%04x\n", parm->wwn[0], parm->wwn[1], parm->wwn[2], parm->wwn[3]); } + printf("additional product id %.8s\n", parm->product_id); if (parm->enabled.extension & ATA_SUPPORT_MEDIASN) { printf("media serial number %.30s\n", parm->media_serial); } printf("cylinders %d\n", parm->cylinders); printf("heads %d\n", parm->heads); printf("sectors/track %d\n", parm->sectors); printf("sector size logical %u, physical %lu, offset %lu\n", ata_logical_sector_size(parm), (unsigned long)ata_physical_sector_size(parm), (unsigned long)ata_logical_sector_offset(parm)); if (parm->config == ATA_PROTO_CFA || (parm->support.command2 & ATA_SUPPORT_CFA)) printf("CFA supported\n"); printf("LBA%ssupported ", parm->capabilities1 & ATA_SUPPORT_LBA ? " " : " not "); if (lbasize) printf("%d sectors\n", lbasize); else printf("\n"); printf("LBA48%ssupported ", parm->support.command2 & ATA_SUPPORT_ADDRESS48 ? " " : " not "); if (lbasize48) printf("%ju sectors\n", (uintmax_t)lbasize48); else printf("\n"); printf("PIO supported PIO"); switch (ata_max_pmode(parm)) { case ATA_PIO4: printf("4"); break; case ATA_PIO3: printf("3"); break; case ATA_PIO2: printf("2"); break; case ATA_PIO1: printf("1"); break; default: printf("0"); } if ((parm->capabilities1 & ATA_SUPPORT_IORDY) == 0) printf(" w/o IORDY"); printf("\n"); printf("DMA%ssupported ", parm->capabilities1 & ATA_SUPPORT_DMA ? " " : " not "); if (parm->capabilities1 & ATA_SUPPORT_DMA) { if (parm->mwdmamodes & 0xff) { printf("WDMA"); if (parm->mwdmamodes & 0x04) printf("2"); else if (parm->mwdmamodes & 0x02) printf("1"); else if (parm->mwdmamodes & 0x01) printf("0"); printf(" "); } if ((parm->atavalid & ATA_FLAG_88) && (parm->udmamodes & 0xff)) { printf("UDMA"); if (parm->udmamodes & 0x40) printf("6"); else if (parm->udmamodes & 0x20) printf("5"); else if (parm->udmamodes & 0x10) printf("4"); else if (parm->udmamodes & 0x08) printf("3"); else if (parm->udmamodes & 0x04) printf("2"); else if (parm->udmamodes & 0x02) printf("1"); else if (parm->udmamodes & 0x01) printf("0"); printf(" "); } } printf("\n"); if (parm->media_rotation_rate == 1) { printf("media RPM non-rotating\n"); } else if (parm->media_rotation_rate >= 0x0401 && parm->media_rotation_rate <= 0xFFFE) { printf("media RPM %d\n", parm->media_rotation_rate); } printf("Zoned-Device Commands "); switch (parm->support3 & ATA_SUPPORT_ZONE_MASK) { case ATA_SUPPORT_ZONE_DEV_MANAGED: printf("device managed\n"); break; case ATA_SUPPORT_ZONE_HOST_AWARE: printf("host aware\n"); break; default: printf("no\n"); } printf("\nFeature " "Support Enabled Value Vendor\n"); printf("read ahead %s %s\n", parm->support.command1 & ATA_SUPPORT_LOOKAHEAD ? "yes" : "no", parm->enabled.command1 & ATA_SUPPORT_LOOKAHEAD ? "yes" : "no"); printf("write cache %s %s\n", parm->support.command1 & ATA_SUPPORT_WRITECACHE ? "yes" : "no", parm->enabled.command1 & ATA_SUPPORT_WRITECACHE ? "yes" : "no"); printf("flush cache %s %s\n", parm->support.command2 & ATA_SUPPORT_FLUSHCACHE ? "yes" : "no", parm->enabled.command2 & ATA_SUPPORT_FLUSHCACHE ? "yes" : "no"); printf("overlap %s\n", parm->capabilities1 & ATA_SUPPORT_OVERLAP ? "yes" : "no"); printf("Tagged Command Queuing (TCQ) %s %s", parm->support.command2 & ATA_SUPPORT_QUEUED ? "yes" : "no", parm->enabled.command2 & ATA_SUPPORT_QUEUED ? "yes" : "no"); if (parm->support.command2 & ATA_SUPPORT_QUEUED) { printf(" %d tags\n", ATA_QUEUE_LEN(parm->queue) + 1); } else printf("\n"); printf("Native Command Queuing (NCQ) "); - if (parm->satacapabilities != 0xffff && - (parm->satacapabilities & ATA_SUPPORT_NCQ)) { + if (atasata(parm) && (parm->satacapabilities & ATA_SUPPORT_NCQ)) { printf("yes %d tags\n", ATA_QUEUE_LEN(parm->queue) + 1); + printf("NCQ Priority Information %s\n", + parm->satacapabilities & ATA_SUPPORT_NCQ_PRIO ? + "yes" : "no"); + printf("NCQ Non-Data Command %s\n", + parm->satacapabilities2 & ATA_SUPPORT_NCQ_NON_DATA ? + "yes" : "no"); + printf("NCQ Streaming %s\n", + parm->satacapabilities2 & ATA_SUPPORT_NCQ_STREAM ? + "yes" : "no"); + printf("Receive & Send FPDMA Queued %s\n", + parm->satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED ? + "yes" : "no"); + printf("NCQ Autosense %s\n", + parm->satasupport & ATA_SUPPORT_NCQ_AUTOSENSE ? + "yes" : "no"); } else printf("no\n"); - printf("NCQ Queue Management %s\n", atasata(parm) && - parm->satacapabilities2 & ATA_SUPPORT_NCQ_QMANAGEMENT ? - "yes" : "no"); - printf("NCQ Streaming %s\n", atasata(parm) && - parm->satacapabilities2 & ATA_SUPPORT_NCQ_STREAM ? - "yes" : "no"); - printf("Receive & Send FPDMA Queued %s\n", atasata(parm) && - parm->satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED ? - "yes" : "no"); - printf("SMART %s %s\n", parm->support.command1 & ATA_SUPPORT_SMART ? "yes" : "no", parm->enabled.command1 & ATA_SUPPORT_SMART ? "yes" : "no"); - printf("microcode download %s %s\n", - parm->support.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no", - parm->enabled.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no"); printf("security %s %s\n", parm->support.command1 & ATA_SUPPORT_SECURITY ? "yes" : "no", parm->enabled.command1 & ATA_SUPPORT_SECURITY ? "yes" : "no"); printf("power management %s %s\n", parm->support.command1 & ATA_SUPPORT_POWERMGT ? "yes" : "no", parm->enabled.command1 & ATA_SUPPORT_POWERMGT ? "yes" : "no"); + printf("microcode download %s %s\n", + parm->support.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no", + parm->enabled.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no"); printf("advanced power management %s %s", parm->support.command2 & ATA_SUPPORT_APM ? "yes" : "no", parm->enabled.command2 & ATA_SUPPORT_APM ? "yes" : "no"); if (parm->support.command2 & ATA_SUPPORT_APM) { printf(" %d/0x%02X\n", parm->apm_value & 0xff, parm->apm_value & 0xff); } else printf("\n"); printf("automatic acoustic management %s %s", parm->support.command2 & ATA_SUPPORT_AUTOACOUSTIC ? "yes" :"no", parm->enabled.command2 & ATA_SUPPORT_AUTOACOUSTIC ? "yes" :"no"); if (parm->support.command2 & ATA_SUPPORT_AUTOACOUSTIC) { printf(" %d/0x%02X %d/0x%02X\n", ATA_ACOUSTIC_CURRENT(parm->acoustic), ATA_ACOUSTIC_CURRENT(parm->acoustic), ATA_ACOUSTIC_VENDOR(parm->acoustic), ATA_ACOUSTIC_VENDOR(parm->acoustic)); } else printf("\n"); printf("media status notification %s %s\n", parm->support.command2 & ATA_SUPPORT_NOTIFY ? "yes" : "no", parm->enabled.command2 & ATA_SUPPORT_NOTIFY ? "yes" : "no"); printf("power-up in Standby %s %s\n", parm->support.command2 & ATA_SUPPORT_STANDBY ? "yes" : "no", parm->enabled.command2 & ATA_SUPPORT_STANDBY ? "yes" : "no"); printf("write-read-verify %s %s", parm->support2 & ATA_SUPPORT_WRITEREADVERIFY ? "yes" : "no", parm->enabled2 & ATA_SUPPORT_WRITEREADVERIFY ? "yes" : "no"); if (parm->support2 & ATA_SUPPORT_WRITEREADVERIFY) { printf(" %d/0x%x\n", parm->wrv_mode, parm->wrv_mode); } else printf("\n"); printf("unload %s %s\n", parm->support.extension & ATA_SUPPORT_UNLOAD ? "yes" : "no", parm->enabled.extension & ATA_SUPPORT_UNLOAD ? "yes" : "no"); printf("general purpose logging %s %s\n", parm->support.extension & ATA_SUPPORT_GENLOG ? "yes" : "no", parm->enabled.extension & ATA_SUPPORT_GENLOG ? "yes" : "no"); printf("free-fall %s %s\n", parm->support2 & ATA_SUPPORT_FREEFALL ? "yes" : "no", parm->enabled2 & ATA_SUPPORT_FREEFALL ? "yes" : "no"); + printf("sense data reporting %s %s\n", + parm->support2 & ATA_SUPPORT_SENSE_REPORT ? "yes" : "no", + parm->enabled2 & ATA_SUPPORT_SENSE_REPORT ? "yes" : "no"); + printf("extended power conditions %s %s\n", + parm->support2 & ATA_SUPPORT_EPC ? "yes" : "no", + parm->enabled2 & ATA_SUPPORT_EPC ? "yes" : "no"); + printf("device statistics notification %s %s\n", + parm->support2 & ATA_SUPPORT_DSN ? "yes" : "no", + parm->enabled2 & ATA_SUPPORT_DSN ? "yes" : "no"); printf("Data Set Management (DSM/TRIM) "); if (parm->support_dsm & ATA_SUPPORT_DSM_TRIM) { printf("yes\n"); printf("DSM - max 512byte blocks "); if (parm->max_dsm_blocks == 0x00) printf("yes not specified\n"); else printf("yes %d\n", parm->max_dsm_blocks); printf("DSM - deterministic read "); if (parm->support3 & ATA_SUPPORT_DRAT) { if (parm->support3 & ATA_SUPPORT_RZAT) printf("yes zeroed\n"); else printf("yes any value\n"); } else { printf("no\n"); } } else { printf("no\n"); } + printf("encrypts all user data %s\n", + parm->support3 & ATA_ENCRYPTS_ALL_USER_DATA ? "yes" : "no"); printf("Sanitize "); if (parm->multi & ATA_SUPPORT_SANITIZE) { printf("yes\t\t%s%s%s\n", parm->multi & ATA_SUPPORT_BLOCK_ERASE_EXT ? "block, " : "", parm->multi & ATA_SUPPORT_OVERWRITE_EXT ? "overwrite, " : "", parm->multi & ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT ? "crypto" : ""); printf("Sanitize - commands allowed %s\n", parm->multi & ATA_SUPPORT_SANITIZE_ALLOWED ? "yes" : "no"); printf("Sanitize - antifreeze lock %s\n", parm->multi & ATA_SUPPORT_ANTIFREEZE_LOCK_EXT ? "yes" : "no"); } else { printf("no\n"); } } static int scsi_cam_pass_16_send(struct cam_device *device, union ccb *ccb, int quiet) { struct ata_pass_16 *ata_pass_16; struct ata_cmd ata_cmd; ata_pass_16 = (struct ata_pass_16 *)ccb->csio.cdb_io.cdb_bytes; ata_cmd.command = ata_pass_16->command; ata_cmd.control = ata_pass_16->control; ata_cmd.features = ata_pass_16->features; if (arglist & CAM_ARG_VERBOSE) { warnx("sending ATA %s via pass_16 with timeout of %u msecs", ata_op_string(&ata_cmd), ccb->csio.ccb_h.timeout); } /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { if (quiet != 1 || arglist & CAM_ARG_VERBOSE) { warn("error sending ATA %s via pass_16", ata_op_string(&ata_cmd)); } if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } return (1); } if (!(ata_pass_16->flags & AP_FLAG_CHK_COND) && (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (quiet != 1 || arglist & CAM_ARG_VERBOSE) { warnx("ATA %s via pass_16 failed", ata_op_string(&ata_cmd)); } if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } return (1); } return (0); } static int ata_cam_send(struct cam_device *device, union ccb *ccb, int quiet) { if (arglist & CAM_ARG_VERBOSE) { warnx("sending ATA %s with timeout of %u msecs", ata_op_string(&(ccb->ataio.cmd)), ccb->ataio.ccb_h.timeout); } /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { if (quiet != 1 || arglist & CAM_ARG_VERBOSE) { warn("error sending ATA %s", ata_op_string(&(ccb->ataio.cmd))); } if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } return (1); } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (quiet != 1 || arglist & CAM_ARG_VERBOSE) { warnx("ATA %s failed: %d", ata_op_string(&(ccb->ataio.cmd)), quiet); } if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } return (1); } return (0); } static int ata_do_pass_16(struct cam_device *device, union ccb *ccb, int retries, u_int32_t flags, u_int8_t protocol, u_int8_t ata_flags, u_int8_t tag_action, u_int8_t command, u_int8_t features, u_int64_t lba, u_int8_t sector_count, u_int8_t *data_ptr, u_int16_t dxfer_len, int timeout, int quiet) { if (data_ptr != NULL) { ata_flags |= AP_FLAG_BYT_BLOK_BYTES | AP_FLAG_TLEN_SECT_CNT; if (flags & CAM_DIR_OUT) ata_flags |= AP_FLAG_TDIR_TO_DEV; else ata_flags |= AP_FLAG_TDIR_FROM_DEV; } else { ata_flags |= AP_FLAG_TLEN_NO_DATA; } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); scsi_ata_pass_16(&ccb->csio, retries, NULL, flags, tag_action, protocol, ata_flags, features, sector_count, lba, command, /*control*/0, data_ptr, dxfer_len, /*sense_len*/SSD_FULL_SIZE, timeout); return scsi_cam_pass_16_send(device, ccb, quiet); } static int ata_try_pass_16(struct cam_device *device) { struct ccb_pathinq cpi; if (get_cpi(device, &cpi) != 0) { warnx("couldn't get CPI"); return (-1); } if (cpi.protocol == PROTO_SCSI) { /* possibly compatible with pass_16 */ return (1); } /* likely not compatible with pass_16 */ return (0); } static int ata_do_28bit_cmd(struct cam_device *device, union ccb *ccb, int retries, u_int32_t flags, u_int8_t protocol, u_int8_t tag_action, u_int8_t command, u_int8_t features, u_int32_t lba, u_int8_t sector_count, u_int8_t *data_ptr, u_int16_t dxfer_len, int timeout, int quiet) { switch (ata_try_pass_16(device)) { case -1: return (1); case 1: /* Try using SCSI Passthrough */ return ata_do_pass_16(device, ccb, retries, flags, protocol, 0, tag_action, command, features, lba, sector_count, data_ptr, dxfer_len, timeout, quiet); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->ataio); cam_fill_ataio(&ccb->ataio, retries, NULL, flags, tag_action, data_ptr, dxfer_len, timeout); ata_28bit_cmd(&ccb->ataio, command, features, lba, sector_count); return ata_cam_send(device, ccb, quiet); } static int ata_do_cmd(struct cam_device *device, union ccb *ccb, int retries, u_int32_t flags, u_int8_t protocol, u_int8_t ata_flags, u_int8_t tag_action, u_int8_t command, u_int8_t features, u_int64_t lba, u_int8_t sector_count, u_int8_t *data_ptr, u_int16_t dxfer_len, int timeout, int force48bit) { int retval; retval = ata_try_pass_16(device); if (retval == -1) return (1); if (retval == 1) { int error; /* Try using SCSI Passthrough */ error = ata_do_pass_16(device, ccb, retries, flags, protocol, ata_flags, tag_action, command, features, lba, sector_count, data_ptr, dxfer_len, timeout, 0); if (ata_flags & AP_FLAG_CHK_COND) { /* Decode ata_res from sense data */ struct ata_res_pass16 *res_pass16; struct ata_res *res; u_int i; u_int16_t *ptr; /* sense_data is 4 byte aligned */ ptr = (uint16_t*)(uintptr_t)&ccb->csio.sense_data; for (i = 0; i < sizeof(*res_pass16) / 2; i++) ptr[i] = le16toh(ptr[i]); /* sense_data is 4 byte aligned */ res_pass16 = (struct ata_res_pass16 *)(uintptr_t) &ccb->csio.sense_data; res = &ccb->ataio.res; res->flags = res_pass16->flags; res->status = res_pass16->status; res->error = res_pass16->error; res->lba_low = res_pass16->lba_low; res->lba_mid = res_pass16->lba_mid; res->lba_high = res_pass16->lba_high; res->device = res_pass16->device; res->lba_low_exp = res_pass16->lba_low_exp; res->lba_mid_exp = res_pass16->lba_mid_exp; res->lba_high_exp = res_pass16->lba_high_exp; res->sector_count = res_pass16->sector_count; res->sector_count_exp = res_pass16->sector_count_exp; ccb->ccb_h.status &= ~CAM_STATUS_MASK; if (res->status & ATA_STATUS_ERROR) ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR; else ccb->ccb_h.status |= CAM_REQ_CMP; } return (error); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->ataio); cam_fill_ataio(&ccb->ataio, retries, NULL, flags, tag_action, data_ptr, dxfer_len, timeout); if (force48bit || lba > ATA_MAX_28BIT_LBA) ata_48bit_cmd(&ccb->ataio, command, features, lba, sector_count); else ata_28bit_cmd(&ccb->ataio, command, features, lba, sector_count); if (ata_flags & AP_FLAG_CHK_COND) ccb->ataio.cmd.flags |= CAM_ATAIO_NEEDRESULT; return ata_cam_send(device, ccb, 0); } static void dump_data(uint16_t *ptr, uint32_t len) { u_int i; for (i = 0; i < len / 2; i++) { if ((i % 8) == 0) printf(" %3d: ", i); printf("%04hx ", ptr[i]); if ((i % 8) == 7) printf("\n"); } if ((i % 8) != 7) printf("\n"); } static int atahpa_proc_resp(struct cam_device *device, union ccb *ccb, int is48bit, u_int64_t *hpasize) { struct ata_res *res; res = &ccb->ataio.res; if (res->status & ATA_STATUS_ERROR) { if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); printf("error = 0x%02x, sector_count = 0x%04x, " "device = 0x%02x, status = 0x%02x\n", res->error, res->sector_count, res->device, res->status); } if (res->error & ATA_ERROR_ID_NOT_FOUND) { warnx("Max address has already been set since " "last power-on or hardware reset"); } return (1); } if (arglist & CAM_ARG_VERBOSE) { fprintf(stdout, "%s%d: Raw native max data:\n", device->device_name, device->dev_unit_num); /* res is 4 byte aligned */ dump_data((uint16_t*)(uintptr_t)res, sizeof(struct ata_res)); printf("error = 0x%02x, sector_count = 0x%04x, device = 0x%02x, " "status = 0x%02x\n", res->error, res->sector_count, res->device, res->status); } if (hpasize != NULL) { if (is48bit) { *hpasize = (((u_int64_t)((res->lba_high_exp << 16) | (res->lba_mid_exp << 8) | res->lba_low_exp) << 24) | ((res->lba_high << 16) | (res->lba_mid << 8) | res->lba_low)) + 1; } else { *hpasize = (((res->device & 0x0f) << 24) | (res->lba_high << 16) | (res->lba_mid << 8) | res->lba_low) + 1; } } return (0); } static int ata_read_native_max(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, struct ata_params *parm, u_int64_t *hpasize) { int error; u_int cmd, is48bit; u_int8_t protocol; is48bit = parm->support.command2 & ATA_SUPPORT_ADDRESS48; protocol = AP_PROTO_NON_DATA; if (is48bit) { cmd = ATA_READ_NATIVE_MAX_ADDRESS48; protocol |= AP_EXTEND; } else { cmd = ATA_READ_NATIVE_MAX_ADDRESS; } error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/protocol, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/cmd, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, timeout ? timeout : 5000, is48bit); if (error) return (error); return atahpa_proc_resp(device, ccb, is48bit, hpasize); } static int atahpa_set_max(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, int is48bit, u_int64_t maxsize, int persist) { int error; u_int cmd; u_int8_t protocol; protocol = AP_PROTO_NON_DATA; if (is48bit) { cmd = ATA_SET_MAX_ADDRESS48; protocol |= AP_EXTEND; } else { cmd = ATA_SET_MAX_ADDRESS; } /* lba's are zero indexed so the max lba is requested max - 1 */ if (maxsize) maxsize--; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/protocol, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/cmd, /*features*/ATA_HPA_FEAT_MAX_ADDR, /*lba*/maxsize, /*sector_count*/persist, /*data_ptr*/NULL, /*dxfer_len*/0, timeout ? timeout : 1000, is48bit); if (error) return (error); return atahpa_proc_resp(device, ccb, is48bit, NULL); } static int atahpa_password(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, int is48bit, struct ata_set_max_pwd *pwd) { int error; u_int cmd; u_int8_t protocol; protocol = AP_PROTO_PIO_OUT; cmd = (is48bit) ? ATA_SET_MAX_ADDRESS48 : ATA_SET_MAX_ADDRESS; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_OUT, /*protocol*/protocol, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/cmd, /*features*/ATA_HPA_FEAT_SET_PWD, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t*)pwd, /*dxfer_len*/sizeof(struct ata_set_max_pwd), timeout ? timeout : 1000, is48bit); if (error) return (error); return atahpa_proc_resp(device, ccb, is48bit, NULL); } static int atahpa_lock(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, int is48bit) { int error; u_int cmd; u_int8_t protocol; protocol = AP_PROTO_NON_DATA; cmd = (is48bit) ? ATA_SET_MAX_ADDRESS48 : ATA_SET_MAX_ADDRESS; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/protocol, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/cmd, /*features*/ATA_HPA_FEAT_LOCK, /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, timeout ? timeout : 1000, is48bit); if (error) return (error); return atahpa_proc_resp(device, ccb, is48bit, NULL); } static int atahpa_unlock(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, int is48bit, struct ata_set_max_pwd *pwd) { int error; u_int cmd; u_int8_t protocol; protocol = AP_PROTO_PIO_OUT; cmd = (is48bit) ? ATA_SET_MAX_ADDRESS48 : ATA_SET_MAX_ADDRESS; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_OUT, /*protocol*/protocol, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/cmd, /*features*/ATA_HPA_FEAT_UNLOCK, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t*)pwd, /*dxfer_len*/sizeof(struct ata_set_max_pwd), timeout ? timeout : 1000, is48bit); if (error) return (error); return atahpa_proc_resp(device, ccb, is48bit, NULL); } static int atahpa_freeze_lock(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, int is48bit) { int error; u_int cmd; u_int8_t protocol; protocol = AP_PROTO_NON_DATA; cmd = (is48bit) ? ATA_SET_MAX_ADDRESS48 : ATA_SET_MAX_ADDRESS; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/protocol, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/cmd, /*features*/ATA_HPA_FEAT_FREEZE, /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, timeout ? timeout : 1000, is48bit); if (error) return (error); return atahpa_proc_resp(device, ccb, is48bit, NULL); } static int ata_get_native_max(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, u_int64_t *nativesize) { int error; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA | AP_EXTEND, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_AMAX_ADDR, /*features*/ATA_AMAX_ADDR_GET, /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, timeout ? timeout : 30 * 1000, /*force48bit*/1); if (error) return (error); return atahpa_proc_resp(device, ccb, /*is48bit*/1, nativesize); } static int ataama_set(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb, u_int64_t maxsize) { int error; /* lba's are zero indexed so the max lba is requested max - 1 */ if (maxsize) maxsize--; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA | AP_EXTEND, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_AMAX_ADDR, /*features*/ATA_AMAX_ADDR_SET, /*lba*/maxsize, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, timeout ? timeout : 30 * 1000, /*force48bit*/1); if (error) return (error); return atahpa_proc_resp(device, ccb, /*is48bit*/1, NULL); } static int ataama_freeze(struct cam_device *device, int retry_count, u_int32_t timeout, union ccb *ccb) { int error; error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA | AP_EXTEND, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_AMAX_ADDR, /*features*/ATA_AMAX_ADDR_FREEZE, /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, timeout ? timeout : 30 * 1000, /*force48bit*/1); if (error) return (error); return atahpa_proc_resp(device, ccb, /*is48bit*/1, NULL); } int ata_do_identify(struct cam_device *device, int retry_count, int timeout, union ccb *ccb, struct ata_params** ident_bufp) { struct ata_params *ident_buf; struct ccb_pathinq cpi; struct ccb_getdev cgd; u_int i, error; int16_t *ptr; u_int8_t command, retry_command; if (get_cpi(device, &cpi) != 0) { warnx("couldn't get CPI"); return (-1); } /* Neither PROTO_ATAPI or PROTO_SATAPM are used in cpi.protocol */ if (cpi.protocol == PROTO_ATA) { if (get_cgd(device, &cgd) != 0) { warnx("couldn't get CGD"); return (-1); } command = (cgd.protocol == PROTO_ATA) ? ATA_ATA_IDENTIFY : ATA_ATAPI_IDENTIFY; retry_command = 0; } else { /* We don't know which for sure so try both */ command = ATA_ATA_IDENTIFY; retry_command = ATA_ATAPI_IDENTIFY; } ptr = (uint16_t *)calloc(1, sizeof(struct ata_params)); if (ptr == NULL) { warnx("can't calloc memory for identify\n"); return (1); } error = ata_do_28bit_cmd(device, ccb, /*retries*/retry_count, /*flags*/CAM_DIR_IN, /*protocol*/AP_PROTO_PIO_IN, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/command, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t *)ptr, /*dxfer_len*/sizeof(struct ata_params), /*timeout*/timeout ? timeout : 30 * 1000, /*quiet*/1); if (error != 0) { if (retry_command == 0) { free(ptr); return (1); } error = ata_do_28bit_cmd(device, ccb, /*retries*/retry_count, /*flags*/CAM_DIR_IN, /*protocol*/AP_PROTO_PIO_IN, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/retry_command, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t *)ptr, /*dxfer_len*/sizeof(struct ata_params), /*timeout*/timeout ? timeout : 30 * 1000, /*quiet*/0); if (error != 0) { free(ptr); return (1); } } ident_buf = (struct ata_params *)ptr; ata_param_fixup(ident_buf); error = 1; for (i = 0; i < sizeof(struct ata_params) / 2; i++) { if (ptr[i] != 0) error = 0; } /* check for invalid (all zero) response */ if (error != 0) { warnx("Invalid identify response detected"); free(ptr); return (error); } *ident_bufp = ident_buf; return (0); } static int ataidentify(struct cam_device *device, int retry_count, int timeout) { union ccb *ccb; struct ata_params *ident_buf; u_int64_t hpasize, nativesize; if ((ccb = cam_getccb(device)) == NULL) { warnx("couldn't allocate CCB"); return (1); } if (ata_do_identify(device, retry_count, timeout, ccb, &ident_buf) != 0) { cam_freeccb(ccb); return (1); } if (arglist & CAM_ARG_VERBOSE) { printf("%s%d: Raw identify data:\n", device->device_name, device->dev_unit_num); dump_data((void*)ident_buf, sizeof(struct ata_params)); } if (ident_buf->support.command1 & ATA_SUPPORT_PROTECTED) { if (ata_read_native_max(device, retry_count, timeout, ccb, ident_buf, &hpasize) != 0) { cam_freeccb(ccb); return (1); } } else { hpasize = 0; } if (ident_buf->support2 & ATA_SUPPORT_AMAX_ADDR) { if (ata_get_native_max(device, retry_count, timeout, ccb, &nativesize) != 0) { cam_freeccb(ccb); return (1); } } else { nativesize = 0; } printf("%s%d: ", device->device_name, device->dev_unit_num); ata_print_ident(ident_buf); camxferrate(device); atacapprint(ident_buf); atahpa_print(ident_buf, hpasize, 0); ataama_print(ident_buf, nativesize, 0); free(ident_buf); cam_freeccb(ccb); return (0); } #ifdef WITH_NVME static int nvmeidentify(struct cam_device *device, int retry_count __unused, int timeout __unused) { struct nvme_controller_data cdata; if (nvme_get_cdata(device, &cdata)) return (1); nvme_print_controller(&cdata); return (0); } #endif static int identify(struct cam_device *device, int retry_count, int timeout) { #ifdef WITH_NVME struct ccb_pathinq cpi; if (get_cpi(device, &cpi) != 0) { warnx("couldn't get CPI"); return (-1); } if (cpi.protocol == PROTO_NVME) { return (nvmeidentify(device, retry_count, timeout)); } #endif return (ataidentify(device, retry_count, timeout)); } enum { ATA_SECURITY_ACTION_PRINT, ATA_SECURITY_ACTION_FREEZE, ATA_SECURITY_ACTION_UNLOCK, ATA_SECURITY_ACTION_DISABLE, ATA_SECURITY_ACTION_ERASE, ATA_SECURITY_ACTION_ERASE_ENHANCED, ATA_SECURITY_ACTION_SET_PASSWORD }; static void atasecurity_print_time(u_int16_t tw) { if (tw == 0) printf("unspecified"); else if (tw >= 255) printf("> 508 min"); else printf("%i min", 2 * tw); } static u_int32_t atasecurity_erase_timeout_msecs(u_int16_t timeout) { if (timeout == 0) return 2 * 3600 * 1000; /* default: two hours */ else if (timeout > 255) return (508 + 60) * 60 * 1000; /* spec says > 508 minutes */ return ((2 * timeout) + 5) * 60 * 1000; /* add a 5min margin */ } static void atasecurity_notify(u_int8_t command, struct ata_security_password *pwd) { struct ata_cmd cmd; bzero(&cmd, sizeof(cmd)); cmd.command = command; printf("Issuing %s", ata_op_string(&cmd)); if (pwd != NULL) { char pass[sizeof(pwd->password)+1]; /* pwd->password may not be null terminated */ pass[sizeof(pwd->password)] = '\0'; strncpy(pass, pwd->password, sizeof(pwd->password)); printf(" password='%s', user='%s'", pass, (pwd->ctrl & ATA_SECURITY_PASSWORD_MASTER) ? "master" : "user"); if (command == ATA_SECURITY_SET_PASSWORD) { printf(", mode='%s'", (pwd->ctrl & ATA_SECURITY_LEVEL_MAXIMUM) ? "maximum" : "high"); } } printf("\n"); } static int atasecurity_freeze(struct cam_device *device, union ccb *ccb, int retry_count, u_int32_t timeout, int quiet) { if (quiet == 0) atasecurity_notify(ATA_SECURITY_FREEZE_LOCK, NULL); return ata_do_28bit_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SECURITY_FREEZE_LOCK, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, /*timeout*/timeout, /*quiet*/0); } static int atasecurity_unlock(struct cam_device *device, union ccb *ccb, int retry_count, u_int32_t timeout, struct ata_security_password *pwd, int quiet) { if (quiet == 0) atasecurity_notify(ATA_SECURITY_UNLOCK, pwd); return ata_do_28bit_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_OUT, /*protocol*/AP_PROTO_PIO_OUT, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SECURITY_UNLOCK, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t *)pwd, /*dxfer_len*/sizeof(*pwd), /*timeout*/timeout, /*quiet*/0); } static int atasecurity_disable(struct cam_device *device, union ccb *ccb, int retry_count, u_int32_t timeout, struct ata_security_password *pwd, int quiet) { if (quiet == 0) atasecurity_notify(ATA_SECURITY_DISABLE_PASSWORD, pwd); return ata_do_28bit_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_OUT, /*protocol*/AP_PROTO_PIO_OUT, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SECURITY_DISABLE_PASSWORD, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t *)pwd, /*dxfer_len*/sizeof(*pwd), /*timeout*/timeout, /*quiet*/0); } static int atasecurity_erase_confirm(struct cam_device *device, struct ata_params* ident_buf) { printf("\nYou are about to ERASE ALL DATA from the following" " device:\n%s%d,%s%d: ", device->device_name, device->dev_unit_num, device->given_dev_name, device->given_unit_number); ata_print_ident(ident_buf); for(;;) { char str[50]; printf("\nAre you SURE you want to ERASE ALL DATA? (yes/no) "); if (fgets(str, sizeof(str), stdin) != NULL) { if (strncasecmp(str, "yes", 3) == 0) { return (1); } else if (strncasecmp(str, "no", 2) == 0) { return (0); } else { printf("Please answer \"yes\" or " "\"no\"\n"); } } } /* NOTREACHED */ return (0); } static int atasecurity_erase(struct cam_device *device, union ccb *ccb, int retry_count, u_int32_t timeout, u_int32_t erase_timeout, struct ata_security_password *pwd, int quiet) { int error; if (quiet == 0) atasecurity_notify(ATA_SECURITY_ERASE_PREPARE, NULL); error = ata_do_28bit_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SECURITY_ERASE_PREPARE, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, /*timeout*/timeout, /*quiet*/0); if (error != 0) return error; if (quiet == 0) atasecurity_notify(ATA_SECURITY_ERASE_UNIT, pwd); error = ata_do_28bit_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_OUT, /*protocol*/AP_PROTO_PIO_OUT, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SECURITY_ERASE_UNIT, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t *)pwd, /*dxfer_len*/sizeof(*pwd), /*timeout*/erase_timeout, /*quiet*/0); if (error == 0 && quiet == 0) printf("\nErase Complete\n"); return error; } static int atasecurity_set_password(struct cam_device *device, union ccb *ccb, int retry_count, u_int32_t timeout, struct ata_security_password *pwd, int quiet) { if (quiet == 0) atasecurity_notify(ATA_SECURITY_SET_PASSWORD, pwd); return ata_do_28bit_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_OUT, /*protocol*/AP_PROTO_PIO_OUT, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SECURITY_SET_PASSWORD, /*features*/0, /*lba*/0, /*sector_count*/0, /*data_ptr*/(u_int8_t *)pwd, /*dxfer_len*/sizeof(*pwd), /*timeout*/timeout, /*quiet*/0); } static void atasecurity_print(struct ata_params *parm) { printf("\nSecurity Option Value\n"); if (arglist & CAM_ARG_VERBOSE) { printf("status %04x\n", parm->security_status); } printf("supported %s\n", parm->security_status & ATA_SECURITY_SUPPORTED ? "yes" : "no"); if (!(parm->security_status & ATA_SECURITY_SUPPORTED)) return; printf("enabled %s\n", parm->security_status & ATA_SECURITY_ENABLED ? "yes" : "no"); printf("drive locked %s\n", parm->security_status & ATA_SECURITY_LOCKED ? "yes" : "no"); printf("security config frozen %s\n", parm->security_status & ATA_SECURITY_FROZEN ? "yes" : "no"); printf("count expired %s\n", parm->security_status & ATA_SECURITY_COUNT_EXP ? "yes" : "no"); printf("security level %s\n", parm->security_status & ATA_SECURITY_LEVEL ? "maximum" : "high"); printf("enhanced erase supported %s\n", parm->security_status & ATA_SECURITY_ENH_SUPP ? "yes" : "no"); printf("erase time "); atasecurity_print_time(parm->erase_time); printf("\n"); printf("enhanced erase time "); atasecurity_print_time(parm->enhanced_erase_time); printf("\n"); printf("master password rev %04x%s\n", parm->master_passwd_revision, parm->master_passwd_revision == 0x0000 || parm->master_passwd_revision == 0xFFFF ? " (unsupported)" : ""); } /* * Validates and copies the password in optarg to the passed buffer. * If the password in optarg is the same length as the buffer then * the data will still be copied but no null termination will occur. */ static int ata_getpwd(u_int8_t *passwd, int max, char opt) { int len; len = strlen(optarg); if (len > max) { warnx("-%c password is too long", opt); return (1); } else if (len == 0) { warnx("-%c password is missing", opt); return (1); } else if (optarg[0] == '-'){ warnx("-%c password starts with '-' (generic arg?)", opt); return (1); } else if (strlen(passwd) != 0 && strcmp(passwd, optarg) != 0) { warnx("-%c password conflicts with existing password from -%c", opt, pwd_opt); return (1); } /* Callers pass in a buffer which does NOT need to be terminated */ strncpy(passwd, optarg, max); pwd_opt = opt; return (0); } enum { ATA_HPA_ACTION_PRINT, ATA_HPA_ACTION_SET_MAX, ATA_HPA_ACTION_SET_PWD, ATA_HPA_ACTION_LOCK, ATA_HPA_ACTION_UNLOCK, ATA_HPA_ACTION_FREEZE_LOCK }; static int atahpa_set_confirm(struct cam_device *device, struct ata_params* ident_buf, u_int64_t maxsize, int persist) { printf("\nYou are about to configure HPA to limit the user accessible\n" "sectors to %ju %s on the device:\n%s%d,%s%d: ", maxsize, persist ? "persistently" : "temporarily", device->device_name, device->dev_unit_num, device->given_dev_name, device->given_unit_number); ata_print_ident(ident_buf); for(;;) { char str[50]; printf("\nAre you SURE you want to configure HPA? (yes/no) "); if (NULL != fgets(str, sizeof(str), stdin)) { if (0 == strncasecmp(str, "yes", 3)) { return (1); } else if (0 == strncasecmp(str, "no", 2)) { return (0); } else { printf("Please answer \"yes\" or " "\"no\"\n"); } } } /* NOTREACHED */ return (0); } static int atahpa(struct cam_device *device, int retry_count, int timeout, int argc, char **argv, char *combinedopt) { union ccb *ccb; struct ata_params *ident_buf; struct ccb_getdev cgd; struct ata_set_max_pwd pwd; int error, confirm, quiet, c, action, actions, persist; int security, is48bit, pwdsize; u_int64_t hpasize, maxsize; actions = 0; confirm = 0; quiet = 0; maxsize = 0; persist = 0; security = 0; memset(&pwd, 0, sizeof(pwd)); /* default action is to print hpa information */ action = ATA_HPA_ACTION_PRINT; pwdsize = sizeof(pwd.password); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c){ case 's': action = ATA_HPA_ACTION_SET_MAX; maxsize = strtoumax(optarg, NULL, 0); actions++; break; case 'p': if (ata_getpwd(pwd.password, pwdsize, c) != 0) return (1); action = ATA_HPA_ACTION_SET_PWD; security = 1; actions++; break; case 'l': action = ATA_HPA_ACTION_LOCK; security = 1; actions++; break; case 'U': if (ata_getpwd(pwd.password, pwdsize, c) != 0) return (1); action = ATA_HPA_ACTION_UNLOCK; security = 1; actions++; break; case 'f': action = ATA_HPA_ACTION_FREEZE_LOCK; security = 1; actions++; break; case 'P': persist = 1; break; case 'y': confirm++; break; case 'q': quiet++; break; } } if (actions > 1) { warnx("too many hpa actions specified"); return (1); } if (get_cgd(device, &cgd) != 0) { warnx("couldn't get CGD"); return (1); } ccb = cam_getccb(device); if (ccb == NULL) { warnx("couldn't allocate CCB"); return (1); } error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf); if (error != 0) { cam_freeccb(ccb); return (1); } if (quiet == 0) { printf("%s%d: ", device->device_name, device->dev_unit_num); ata_print_ident(ident_buf); camxferrate(device); } if (action == ATA_HPA_ACTION_PRINT) { error = ata_read_native_max(device, retry_count, timeout, ccb, ident_buf, &hpasize); if (error == 0) atahpa_print(ident_buf, hpasize, 1); cam_freeccb(ccb); free(ident_buf); return (error); } if (!(ident_buf->support.command1 & ATA_SUPPORT_PROTECTED)) { warnx("HPA is not supported by this device"); cam_freeccb(ccb); free(ident_buf); return (1); } if (security && !(ident_buf->support.command2 & ATA_SUPPORT_MAXSECURITY)) { warnx("HPA Security is not supported by this device"); cam_freeccb(ccb); free(ident_buf); return (1); } is48bit = ident_buf->support.command2 & ATA_SUPPORT_ADDRESS48; /* * The ATA spec requires: * 1. Read native max addr is called directly before set max addr * 2. Read native max addr is NOT called before any other set max call */ switch(action) { case ATA_HPA_ACTION_SET_MAX: if (confirm == 0 && atahpa_set_confirm(device, ident_buf, maxsize, persist) == 0) { cam_freeccb(ccb); free(ident_buf); return (1); } error = ata_read_native_max(device, retry_count, timeout, ccb, ident_buf, &hpasize); if (error == 0) { error = atahpa_set_max(device, retry_count, timeout, ccb, is48bit, maxsize, persist); if (error == 0 && quiet == 0) { /* redo identify to get new lba values */ error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf); atahpa_print(ident_buf, hpasize, 1); /* Hint CAM to reprobe the device. */ reprobe(device); } } break; case ATA_HPA_ACTION_SET_PWD: error = atahpa_password(device, retry_count, timeout, ccb, is48bit, &pwd); if (error == 0 && quiet == 0) printf("HPA password has been set\n"); break; case ATA_HPA_ACTION_LOCK: error = atahpa_lock(device, retry_count, timeout, ccb, is48bit); if (error == 0 && quiet == 0) printf("HPA has been locked\n"); break; case ATA_HPA_ACTION_UNLOCK: error = atahpa_unlock(device, retry_count, timeout, ccb, is48bit, &pwd); if (error == 0 && quiet == 0) printf("HPA has been unlocked\n"); break; case ATA_HPA_ACTION_FREEZE_LOCK: error = atahpa_freeze_lock(device, retry_count, timeout, ccb, is48bit); if (error == 0 && quiet == 0) printf("HPA has been frozen\n"); break; default: errx(1, "Option currently not supported"); } cam_freeccb(ccb); free(ident_buf); return (error); } enum { ATA_AMA_ACTION_PRINT, ATA_AMA_ACTION_SET_MAX, ATA_AMA_ACTION_FREEZE_LOCK }; static int ataama(struct cam_device *device, int retry_count, int timeout, int argc, char **argv, char *combinedopt) { union ccb *ccb; struct ata_params *ident_buf; struct ccb_getdev cgd; int error, quiet, c, action, actions; u_int64_t nativesize, maxsize; actions = 0; quiet = 0; maxsize = 0; /* default action is to print AMA information */ action = ATA_AMA_ACTION_PRINT; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c){ case 's': action = ATA_AMA_ACTION_SET_MAX; maxsize = strtoumax(optarg, NULL, 0); actions++; break; case 'f': action = ATA_AMA_ACTION_FREEZE_LOCK; actions++; break; case 'q': quiet++; break; } } if (actions > 1) { warnx("too many AMA actions specified"); return (1); } if (get_cgd(device, &cgd) != 0) { warnx("couldn't get CGD"); return (1); } ccb = cam_getccb(device); if (ccb == NULL) { warnx("couldn't allocate CCB"); return (1); } error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf); if (error != 0) { cam_freeccb(ccb); return (1); } if (quiet == 0) { printf("%s%d: ", device->device_name, device->dev_unit_num); ata_print_ident(ident_buf); camxferrate(device); } if (action == ATA_AMA_ACTION_PRINT) { error = ata_get_native_max(device, retry_count, timeout, ccb, &nativesize); if (error == 0) ataama_print(ident_buf, nativesize, 1); cam_freeccb(ccb); free(ident_buf); return (error); } if (!(ident_buf->support2 & ATA_SUPPORT_AMAX_ADDR)) { warnx("Accessible Max Address is not supported by this device"); cam_freeccb(ccb); free(ident_buf); return (1); } switch(action) { case ATA_AMA_ACTION_SET_MAX: error = ata_get_native_max(device, retry_count, timeout, ccb, &nativesize); if (error == 0) { error = ataama_set(device, retry_count, timeout, ccb, maxsize); if (error == 0 && quiet == 0) { /* redo identify to get new lba values */ error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf); ataama_print(ident_buf, nativesize, 1); /* Hint CAM to reprobe the device. */ reprobe(device); } } break; case ATA_AMA_ACTION_FREEZE_LOCK: error = ataama_freeze(device, retry_count, timeout, ccb); if (error == 0 && quiet == 0) printf("Accessible Max Address has been frozen\n"); break; default: errx(1, "Option currently not supported"); } cam_freeccb(ccb); free(ident_buf); return (error); } static int atasecurity(struct cam_device *device, int retry_count, int timeout, int argc, char **argv, char *combinedopt) { union ccb *ccb; struct ata_params *ident_buf; int error, confirm, quiet, c, action, actions, setpwd; int security_enabled, erase_timeout, pwdsize; struct ata_security_password pwd; actions = 0; setpwd = 0; erase_timeout = 0; confirm = 0; quiet = 0; memset(&pwd, 0, sizeof(pwd)); /* default action is to print security information */ action = ATA_SECURITY_ACTION_PRINT; /* user is master by default as its safer that way */ pwd.ctrl |= ATA_SECURITY_PASSWORD_MASTER; pwdsize = sizeof(pwd.password); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c){ case 'f': action = ATA_SECURITY_ACTION_FREEZE; actions++; break; case 'U': if (strcasecmp(optarg, "user") == 0) { pwd.ctrl |= ATA_SECURITY_PASSWORD_USER; pwd.ctrl &= ~ATA_SECURITY_PASSWORD_MASTER; } else if (strcasecmp(optarg, "master") == 0) { pwd.ctrl |= ATA_SECURITY_PASSWORD_MASTER; pwd.ctrl &= ~ATA_SECURITY_PASSWORD_USER; } else { warnx("-U argument '%s' is invalid (must be " "'user' or 'master')", optarg); return (1); } break; case 'l': if (strcasecmp(optarg, "high") == 0) { pwd.ctrl |= ATA_SECURITY_LEVEL_HIGH; pwd.ctrl &= ~ATA_SECURITY_LEVEL_MAXIMUM; } else if (strcasecmp(optarg, "maximum") == 0) { pwd.ctrl |= ATA_SECURITY_LEVEL_MAXIMUM; pwd.ctrl &= ~ATA_SECURITY_LEVEL_HIGH; } else { warnx("-l argument '%s' is unknown (must be " "'high' or 'maximum')", optarg); return (1); } break; case 'k': if (ata_getpwd(pwd.password, pwdsize, c) != 0) return (1); action = ATA_SECURITY_ACTION_UNLOCK; actions++; break; case 'd': if (ata_getpwd(pwd.password, pwdsize, c) != 0) return (1); action = ATA_SECURITY_ACTION_DISABLE; actions++; break; case 'e': if (ata_getpwd(pwd.password, pwdsize, c) != 0) return (1); action = ATA_SECURITY_ACTION_ERASE; actions++; break; case 'h': if (ata_getpwd(pwd.password, pwdsize, c) != 0) return (1); pwd.ctrl |= ATA_SECURITY_ERASE_ENHANCED; action = ATA_SECURITY_ACTION_ERASE_ENHANCED; actions++; break; case 's': if (ata_getpwd(pwd.password, pwdsize, c) != 0) return (1); setpwd = 1; if (action == ATA_SECURITY_ACTION_PRINT) action = ATA_SECURITY_ACTION_SET_PASSWORD; /* * Don't increment action as this can be combined * with other actions. */ break; case 'y': confirm++; break; case 'q': quiet++; break; case 'T': erase_timeout = atoi(optarg) * 1000; break; } } if (actions > 1) { warnx("too many security actions specified"); return (1); } if ((ccb = cam_getccb(device)) == NULL) { warnx("couldn't allocate CCB"); return (1); } error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf); if (error != 0) { cam_freeccb(ccb); return (1); } if (quiet == 0) { printf("%s%d: ", device->device_name, device->dev_unit_num); ata_print_ident(ident_buf); camxferrate(device); } if (action == ATA_SECURITY_ACTION_PRINT) { atasecurity_print(ident_buf); free(ident_buf); cam_freeccb(ccb); return (0); } if ((ident_buf->support.command1 & ATA_SUPPORT_SECURITY) == 0) { warnx("Security not supported"); free(ident_buf); cam_freeccb(ccb); return (1); } /* default timeout 15 seconds the same as linux hdparm */ timeout = timeout ? timeout : 15 * 1000; security_enabled = ident_buf->security_status & ATA_SECURITY_ENABLED; /* first set the password if requested */ if (setpwd == 1) { /* confirm we can erase before setting the password if erasing */ if (confirm == 0 && (action == ATA_SECURITY_ACTION_ERASE_ENHANCED || action == ATA_SECURITY_ACTION_ERASE) && atasecurity_erase_confirm(device, ident_buf) == 0) { cam_freeccb(ccb); free(ident_buf); return (error); } if (pwd.ctrl & ATA_SECURITY_PASSWORD_MASTER) { pwd.revision = ident_buf->master_passwd_revision; if (pwd.revision != 0 && pwd.revision != 0xfff && --pwd.revision == 0) { pwd.revision = 0xfffe; } } error = atasecurity_set_password(device, ccb, retry_count, timeout, &pwd, quiet); if (error != 0) { cam_freeccb(ccb); free(ident_buf); return (error); } security_enabled = 1; } switch(action) { case ATA_SECURITY_ACTION_FREEZE: error = atasecurity_freeze(device, ccb, retry_count, timeout, quiet); break; case ATA_SECURITY_ACTION_UNLOCK: if (security_enabled) { if (ident_buf->security_status & ATA_SECURITY_LOCKED) { error = atasecurity_unlock(device, ccb, retry_count, timeout, &pwd, quiet); } else { warnx("Can't unlock, drive is not locked"); error = 1; } } else { warnx("Can't unlock, security is disabled"); error = 1; } break; case ATA_SECURITY_ACTION_DISABLE: if (security_enabled) { /* First unlock the drive if its locked */ if (ident_buf->security_status & ATA_SECURITY_LOCKED) { error = atasecurity_unlock(device, ccb, retry_count, timeout, &pwd, quiet); } if (error == 0) { error = atasecurity_disable(device, ccb, retry_count, timeout, &pwd, quiet); } } else { warnx("Can't disable security (already disabled)"); error = 1; } break; case ATA_SECURITY_ACTION_ERASE: if (security_enabled) { if (erase_timeout == 0) { erase_timeout = atasecurity_erase_timeout_msecs( ident_buf->erase_time); } error = atasecurity_erase(device, ccb, retry_count, timeout, erase_timeout, &pwd, quiet); } else { warnx("Can't secure erase (security is disabled)"); error = 1; } break; case ATA_SECURITY_ACTION_ERASE_ENHANCED: if (security_enabled) { if (ident_buf->security_status & ATA_SECURITY_ENH_SUPP) { if (erase_timeout == 0) { erase_timeout = atasecurity_erase_timeout_msecs( ident_buf->enhanced_erase_time); } error = atasecurity_erase(device, ccb, retry_count, timeout, erase_timeout, &pwd, quiet); } else { warnx("Enhanced erase is not supported"); error = 1; } } else { warnx("Can't secure erase (enhanced), " "(security is disabled)"); error = 1; } break; } cam_freeccb(ccb); free(ident_buf); return (error); } /* * Convert periph name into a bus, target and lun. * * Returns the number of parsed components, or 0. */ static int parse_btl_name(char *tstr, path_id_t *bus, target_id_t *target, lun_id_t *lun, cam_argmask *arglst) { int fd; union ccb ccb; bzero(&ccb, sizeof(ccb)); ccb.ccb_h.func_code = XPT_GDEVLIST; if (cam_get_device(tstr, ccb.cgdl.periph_name, sizeof(ccb.cgdl.periph_name), &ccb.cgdl.unit_number) == -1) { warnx("%s", cam_errbuf); return (0); } /* * Attempt to get the passthrough device. This ioctl will * fail if the device name is null, if the device doesn't * exist, or if the passthrough driver isn't in the kernel. */ if ((fd = open(XPT_DEVICE, O_RDWR)) == -1) { warn("Unable to open %s", XPT_DEVICE); return (0); } if (ioctl(fd, CAMGETPASSTHRU, &ccb) == -1) { warn("Unable to find bus:target:lun for device %s%d", ccb.cgdl.periph_name, ccb.cgdl.unit_number); close(fd); return (0); } close(fd); if ((ccb.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { const struct cam_status_entry *entry; entry = cam_fetch_status_entry(ccb.ccb_h.status); warnx("Unable to find bus:target_lun for device %s%d, " "CAM status: %s (%#x)", ccb.cgdl.periph_name, ccb.cgdl.unit_number, entry ? entry->status_text : "Unknown", ccb.ccb_h.status); return (0); } /* * The kernel fills in the bus/target/lun. We don't * need the passthrough device name and unit number since * we aren't going to open it. */ *bus = ccb.ccb_h.path_id; *target = ccb.ccb_h.target_id; *lun = ccb.ccb_h.target_lun; *arglst |= CAM_ARG_BUS | CAM_ARG_TARGET | CAM_ARG_LUN; return (3); } /* * Parse out a bus, or a bus, target and lun in the following * format: * bus * bus:target * bus:target:lun * * Returns the number of parsed components, or 0. */ static int parse_btl(char *tstr, path_id_t *bus, target_id_t *target, lun_id_t *lun, cam_argmask *arglst) { char *tmpstr, *end; int convs = 0; *bus = CAM_BUS_WILDCARD; *target = CAM_TARGET_WILDCARD; *lun = CAM_LUN_WILDCARD; while (isspace(*tstr) && (*tstr != '\0')) tstr++; if (strncasecmp(tstr, "all", strlen("all")) == 0) { arglist |= CAM_ARG_BUS; return (1); } if (!isdigit(*tstr)) return (parse_btl_name(tstr, bus, target, lun, arglst)); tmpstr = strsep(&tstr, ":"); if ((tmpstr != NULL) && (*tmpstr != '\0')) { *bus = strtol(tmpstr, &end, 0); if (*end != '\0') return (0); *arglst |= CAM_ARG_BUS; convs++; tmpstr = strsep(&tstr, ":"); if ((tmpstr != NULL) && (*tmpstr != '\0')) { *target = strtol(tmpstr, &end, 0); if (*end != '\0') return (0); *arglst |= CAM_ARG_TARGET; convs++; tmpstr = strsep(&tstr, ":"); if ((tmpstr != NULL) && (*tmpstr != '\0')) { *lun = strtoll(tmpstr, &end, 0); if (*end != '\0') return (0); *arglst |= CAM_ARG_LUN; convs++; } } } return convs; } static int dorescan_or_reset(int argc, char **argv, int rescan) { static const char must[] = "you must specify \"all\", a bus, a bus:target:lun or periph to %s"; int rv, error = 0; path_id_t bus = CAM_BUS_WILDCARD; target_id_t target = CAM_TARGET_WILDCARD; lun_id_t lun = CAM_LUN_WILDCARD; char *tstr; if (argc < 3) { warnx(must, rescan? "rescan" : "reset"); return (1); } tstr = argv[optind]; while (isspace(*tstr) && (*tstr != '\0')) tstr++; if (strncasecmp(tstr, "all", strlen("all")) == 0) arglist |= CAM_ARG_BUS; else { rv = parse_btl(argv[optind], &bus, &target, &lun, &arglist); if (rv != 1 && rv != 3) { warnx(must, rescan ? "rescan" : "reset"); return (1); } } if (arglist & CAM_ARG_LUN) error = scanlun_or_reset_dev(bus, target, lun, rescan); else error = rescan_or_reset_bus(bus, rescan); return (error); } static int rescan_or_reset_bus(path_id_t bus, int rescan) { union ccb *ccb = NULL, *matchccb = NULL; int fd = -1, retval; int bufsize; retval = 0; if ((fd = open(XPT_DEVICE, O_RDWR)) < 0) { warnx("error opening transport layer device %s", XPT_DEVICE); warn("%s", XPT_DEVICE); return (1); } ccb = malloc(sizeof(*ccb)); if (ccb == NULL) { warn("failed to allocate CCB"); retval = 1; goto bailout; } bzero(ccb, sizeof(*ccb)); if (bus != CAM_BUS_WILDCARD) { ccb->ccb_h.func_code = rescan ? XPT_SCAN_BUS : XPT_RESET_BUS; ccb->ccb_h.path_id = bus; ccb->ccb_h.target_id = CAM_TARGET_WILDCARD; ccb->ccb_h.target_lun = CAM_LUN_WILDCARD; ccb->crcn.flags = CAM_FLAG_NONE; /* run this at a low priority */ ccb->ccb_h.pinfo.priority = 5; if (ioctl(fd, CAMIOCOMMAND, ccb) == -1) { warn("CAMIOCOMMAND ioctl failed"); retval = 1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { fprintf(stdout, "%s of bus %d was successful\n", rescan ? "Re-scan" : "Reset", bus); } else { fprintf(stdout, "%s of bus %d returned error %#x\n", rescan ? "Re-scan" : "Reset", bus, ccb->ccb_h.status & CAM_STATUS_MASK); retval = 1; } goto bailout; } /* * The right way to handle this is to modify the xpt so that it can * handle a wildcarded bus in a rescan or reset CCB. At the moment * that isn't implemented, so instead we enumerate the buses and * send the rescan or reset to those buses in the case where the * given bus is -1 (wildcard). We don't send a rescan or reset * to the xpt bus; sending a rescan to the xpt bus is effectively a * no-op, sending a rescan to the xpt bus would result in a status of * CAM_REQ_INVALID. */ matchccb = malloc(sizeof(*matchccb)); if (matchccb == NULL) { warn("failed to allocate CCB"); retval = 1; goto bailout; } bzero(matchccb, sizeof(*matchccb)); matchccb->ccb_h.func_code = XPT_DEV_MATCH; matchccb->ccb_h.path_id = CAM_BUS_WILDCARD; bufsize = sizeof(struct dev_match_result) * 20; matchccb->cdm.match_buf_len = bufsize; matchccb->cdm.matches=(struct dev_match_result *)malloc(bufsize); if (matchccb->cdm.matches == NULL) { warnx("can't malloc memory for matches"); retval = 1; goto bailout; } matchccb->cdm.num_matches = 0; matchccb->cdm.num_patterns = 1; matchccb->cdm.pattern_buf_len = sizeof(struct dev_match_pattern); matchccb->cdm.patterns = (struct dev_match_pattern *)malloc( matchccb->cdm.pattern_buf_len); if (matchccb->cdm.patterns == NULL) { warnx("can't malloc memory for patterns"); retval = 1; goto bailout; } matchccb->cdm.patterns[0].type = DEV_MATCH_BUS; matchccb->cdm.patterns[0].pattern.bus_pattern.flags = BUS_MATCH_ANY; do { unsigned int i; if (ioctl(fd, CAMIOCOMMAND, matchccb) == -1) { warn("CAMIOCOMMAND ioctl failed"); retval = 1; goto bailout; } if ((matchccb->ccb_h.status != CAM_REQ_CMP) || ((matchccb->cdm.status != CAM_DEV_MATCH_LAST) && (matchccb->cdm.status != CAM_DEV_MATCH_MORE))) { warnx("got CAM error %#x, CDM error %d\n", matchccb->ccb_h.status, matchccb->cdm.status); retval = 1; goto bailout; } for (i = 0; i < matchccb->cdm.num_matches; i++) { struct bus_match_result *bus_result; /* This shouldn't happen. */ if (matchccb->cdm.matches[i].type != DEV_MATCH_BUS) continue; bus_result =&matchccb->cdm.matches[i].result.bus_result; /* * We don't want to rescan or reset the xpt bus. * See above. */ if (bus_result->path_id == CAM_XPT_PATH_ID) continue; ccb->ccb_h.func_code = rescan ? XPT_SCAN_BUS : XPT_RESET_BUS; ccb->ccb_h.path_id = bus_result->path_id; ccb->ccb_h.target_id = CAM_TARGET_WILDCARD; ccb->ccb_h.target_lun = CAM_LUN_WILDCARD; ccb->crcn.flags = CAM_FLAG_NONE; /* run this at a low priority */ ccb->ccb_h.pinfo.priority = 5; if (ioctl(fd, CAMIOCOMMAND, ccb) == -1) { warn("CAMIOCOMMAND ioctl failed"); retval = 1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK)==CAM_REQ_CMP){ fprintf(stdout, "%s of bus %d was successful\n", rescan? "Re-scan" : "Reset", bus_result->path_id); } else { /* * Don't bail out just yet, maybe the other * rescan or reset commands will complete * successfully. */ fprintf(stderr, "%s of bus %d returned error " "%#x\n", rescan? "Re-scan" : "Reset", bus_result->path_id, ccb->ccb_h.status & CAM_STATUS_MASK); retval = 1; } } } while ((matchccb->ccb_h.status == CAM_REQ_CMP) && (matchccb->cdm.status == CAM_DEV_MATCH_MORE)); bailout: if (fd != -1) close(fd); if (matchccb != NULL) { free(matchccb->cdm.patterns); free(matchccb->cdm.matches); free(matchccb); } free(ccb); return (retval); } static int scanlun_or_reset_dev(path_id_t bus, target_id_t target, lun_id_t lun, int scan) { union ccb ccb; struct cam_device *device; int fd; device = NULL; if (bus == CAM_BUS_WILDCARD) { warnx("invalid bus number %d", bus); return (1); } if (target == CAM_TARGET_WILDCARD) { warnx("invalid target number %d", target); return (1); } if (lun == CAM_LUN_WILDCARD) { warnx("invalid lun number %jx", (uintmax_t)lun); return (1); } fd = -1; bzero(&ccb, sizeof(union ccb)); if (scan) { if ((fd = open(XPT_DEVICE, O_RDWR)) < 0) { warnx("error opening transport layer device %s\n", XPT_DEVICE); warn("%s", XPT_DEVICE); return (1); } } else { device = cam_open_btl(bus, target, lun, O_RDWR, NULL); if (device == NULL) { warnx("%s", cam_errbuf); return (1); } } ccb.ccb_h.func_code = (scan)? XPT_SCAN_LUN : XPT_RESET_DEV; ccb.ccb_h.path_id = bus; ccb.ccb_h.target_id = target; ccb.ccb_h.target_lun = lun; ccb.ccb_h.timeout = 5000; ccb.crcn.flags = CAM_FLAG_NONE; /* run this at a low priority */ ccb.ccb_h.pinfo.priority = 5; if (scan) { if (ioctl(fd, CAMIOCOMMAND, &ccb) < 0) { warn("CAMIOCOMMAND ioctl failed"); close(fd); return (1); } } else { if (cam_send_ccb(device, &ccb) < 0) { warn("error sending XPT_RESET_DEV CCB"); cam_close_device(device); return (1); } } if (scan) close(fd); else cam_close_device(device); /* * An error code of CAM_BDR_SENT is normal for a BDR request. */ if (((ccb.ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) || ((!scan) && ((ccb.ccb_h.status & CAM_STATUS_MASK) == CAM_BDR_SENT))) { fprintf(stdout, "%s of %d:%d:%jx was successful\n", scan? "Re-scan" : "Reset", bus, target, (uintmax_t)lun); return (0); } else { fprintf(stdout, "%s of %d:%d:%jx returned error %#x\n", scan? "Re-scan" : "Reset", bus, target, (uintmax_t)lun, ccb.ccb_h.status & CAM_STATUS_MASK); return (1); } } static struct scsi_nv defect_list_type_map[] = { { "block", SRDD10_BLOCK_FORMAT }, { "extbfi", SRDD10_EXT_BFI_FORMAT }, { "extphys", SRDD10_EXT_PHYS_FORMAT }, { "longblock", SRDD10_LONG_BLOCK_FORMAT }, { "bfi", SRDD10_BYTES_FROM_INDEX_FORMAT }, { "phys", SRDD10_PHYSICAL_SECTOR_FORMAT } }; static int readdefects(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { union ccb *ccb = NULL; struct scsi_read_defect_data_hdr_10 *hdr10 = NULL; struct scsi_read_defect_data_hdr_12 *hdr12 = NULL; size_t hdr_size = 0, entry_size = 0; int use_12byte = 0; int hex_format = 0; u_int8_t *defect_list = NULL; u_int8_t list_format = 0; int list_type_set = 0; u_int32_t dlist_length = 0; u_int32_t returned_length = 0, valid_len = 0; u_int32_t num_returned = 0, num_valid = 0; u_int32_t max_possible_size = 0, hdr_max = 0; u_int32_t starting_offset = 0; u_int8_t returned_format, returned_type; unsigned int i; int summary = 0, quiet = 0; int c, error = 0; int lists_specified = 0; int get_length = 1, first_pass = 1; int mads = 0; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c){ case 'f': { scsi_nv_status status; int entry_num = 0; status = scsi_get_nv(defect_list_type_map, sizeof(defect_list_type_map) / sizeof(defect_list_type_map[0]), optarg, &entry_num, SCSI_NV_FLAG_IG_CASE); if (status == SCSI_NV_FOUND) { list_format = defect_list_type_map[ entry_num].value; list_type_set = 1; } else { warnx("%s: %s %s option %s", __func__, (status == SCSI_NV_AMBIGUOUS) ? "ambiguous" : "invalid", "defect list type", optarg); error = 1; goto defect_bailout; } break; } case 'G': arglist |= CAM_ARG_GLIST; break; case 'P': arglist |= CAM_ARG_PLIST; break; case 'q': quiet = 1; break; case 's': summary = 1; break; case 'S': { char *endptr; starting_offset = strtoul(optarg, &endptr, 0); if (*endptr != '\0') { error = 1; warnx("invalid starting offset %s", optarg); goto defect_bailout; } break; } case 'X': hex_format = 1; break; default: break; } } if (list_type_set == 0) { error = 1; warnx("no defect list format specified"); goto defect_bailout; } if (arglist & CAM_ARG_PLIST) { list_format |= SRDD10_PLIST; lists_specified++; } if (arglist & CAM_ARG_GLIST) { list_format |= SRDD10_GLIST; lists_specified++; } /* * This implies a summary, and was the previous behavior. */ if (lists_specified == 0) summary = 1; ccb = cam_getccb(device); retry_12byte: /* * We start off asking for just the header to determine how much * defect data is available. Some Hitachi drives return an error * if you ask for more data than the drive has. Once we know the * length, we retry the command with the returned length. */ if (use_12byte == 0) dlist_length = sizeof(*hdr10); else dlist_length = sizeof(*hdr12); retry: if (defect_list != NULL) { free(defect_list); defect_list = NULL; } defect_list = malloc(dlist_length); if (defect_list == NULL) { warnx("can't malloc memory for defect list"); error = 1; goto defect_bailout; } next_batch: bzero(defect_list, dlist_length); /* * cam_getccb() zeros the CCB header only. So we need to zero the * payload portion of the ccb. */ CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); scsi_read_defects(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*list_format*/ list_format, /*addr_desc_index*/ starting_offset, /*data_ptr*/ defect_list, /*dxfer_len*/ dlist_length, /*minimum_cmd_size*/ use_12byte ? 12 : 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (cam_send_ccb(device, ccb) < 0) { perror("error reading defect list"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } error = 1; goto defect_bailout; } valid_len = ccb->csio.dxfer_len - ccb->csio.resid; if (use_12byte == 0) { hdr10 = (struct scsi_read_defect_data_hdr_10 *)defect_list; hdr_size = sizeof(*hdr10); hdr_max = SRDDH10_MAX_LENGTH; if (valid_len >= hdr_size) { returned_length = scsi_2btoul(hdr10->length); returned_format = hdr10->format; } else { returned_length = 0; returned_format = 0; } } else { hdr12 = (struct scsi_read_defect_data_hdr_12 *)defect_list; hdr_size = sizeof(*hdr12); hdr_max = SRDDH12_MAX_LENGTH; if (valid_len >= hdr_size) { returned_length = scsi_4btoul(hdr12->length); returned_format = hdr12->format; } else { returned_length = 0; returned_format = 0; } } returned_type = returned_format & SRDDH10_DLIST_FORMAT_MASK; switch (returned_type) { case SRDD10_BLOCK_FORMAT: entry_size = sizeof(struct scsi_defect_desc_block); break; case SRDD10_LONG_BLOCK_FORMAT: entry_size = sizeof(struct scsi_defect_desc_long_block); break; case SRDD10_EXT_PHYS_FORMAT: case SRDD10_PHYSICAL_SECTOR_FORMAT: entry_size = sizeof(struct scsi_defect_desc_phys_sector); break; case SRDD10_EXT_BFI_FORMAT: case SRDD10_BYTES_FROM_INDEX_FORMAT: entry_size = sizeof(struct scsi_defect_desc_bytes_from_index); break; default: warnx("Unknown defect format 0x%x\n", returned_type); error = 1; goto defect_bailout; break; } max_possible_size = (hdr_max / entry_size) * entry_size; num_returned = returned_length / entry_size; num_valid = min(returned_length, valid_len - hdr_size); num_valid /= entry_size; if (get_length != 0) { get_length = 0; if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR) { struct scsi_sense_data *sense; int error_code, sense_key, asc, ascq; sense = &ccb->csio.sense_data; scsi_extract_sense_len(sense, ccb->csio.sense_len - ccb->csio.sense_resid, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); /* * If the drive is reporting that it just doesn't * support the defect list format, go ahead and use * the length it reported. Otherwise, the length * may not be valid, so use the maximum. */ if ((sense_key == SSD_KEY_RECOVERED_ERROR) && (asc == 0x1c) && (ascq == 0x00) && (returned_length > 0)) { if ((use_12byte == 0) && (returned_length >= max_possible_size)) { get_length = 1; use_12byte = 1; goto retry_12byte; } dlist_length = returned_length + hdr_size; } else if ((sense_key == SSD_KEY_RECOVERED_ERROR) && (asc == 0x1f) && (ascq == 0x00) && (returned_length > 0)) { /* Partial defect list transfer */ /* * Hitachi drives return this error * along with a partial defect list if they * have more defects than the 10 byte * command can support. Retry with the 12 * byte command. */ if (use_12byte == 0) { get_length = 1; use_12byte = 1; goto retry_12byte; } dlist_length = returned_length + hdr_size; } else if ((sense_key == SSD_KEY_ILLEGAL_REQUEST) && (asc == 0x24) && (ascq == 0x00)) { /* Invalid field in CDB */ /* * SBC-3 says that if the drive has more * defects than can be reported with the * 10 byte command, it should return this * error and no data. Retry with the 12 * byte command. */ if (use_12byte == 0) { get_length = 1; use_12byte = 1; goto retry_12byte; } dlist_length = returned_length + hdr_size; } else { /* * If we got a SCSI error and no valid length, * just use the 10 byte maximum. The 12 * byte maximum is too large. */ if (returned_length == 0) dlist_length = SRDD10_MAX_LENGTH; else { if ((use_12byte == 0) && (returned_length >= max_possible_size)) { get_length = 1; use_12byte = 1; goto retry_12byte; } dlist_length = returned_length + hdr_size; } } } else if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP){ error = 1; warnx("Error reading defect header"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto defect_bailout; } else { if ((use_12byte == 0) && (returned_length >= max_possible_size)) { get_length = 1; use_12byte = 1; goto retry_12byte; } dlist_length = returned_length + hdr_size; } if (summary != 0) { fprintf(stdout, "%u", num_returned); if (quiet == 0) { fprintf(stdout, " defect%s", (num_returned != 1) ? "s" : ""); } fprintf(stdout, "\n"); goto defect_bailout; } /* * We always limit the list length to the 10-byte maximum * length (0xffff). The reason is that some controllers * can't handle larger I/Os, and we can transfer the entire * 10 byte list in one shot. For drives that support the 12 * byte read defects command, we'll step through the list * by specifying a starting offset. For drives that don't * support the 12 byte command's starting offset, we'll * just display the first 64K. */ dlist_length = min(dlist_length, SRDD10_MAX_LENGTH); goto retry; } if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR) && (ccb->csio.scsi_status == SCSI_STATUS_CHECK_COND) && ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)) { struct scsi_sense_data *sense; int error_code, sense_key, asc, ascq; sense = &ccb->csio.sense_data; scsi_extract_sense_len(sense, ccb->csio.sense_len - ccb->csio.sense_resid, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); /* * According to the SCSI spec, if the disk doesn't support * the requested format, it will generally return a sense * key of RECOVERED ERROR, and an additional sense code * of "DEFECT LIST NOT FOUND". HGST drives also return * Primary/Grown defect list not found errors. So just * check for an ASC of 0x1c. */ if ((sense_key == SSD_KEY_RECOVERED_ERROR) && (asc == 0x1c)) { const char *format_str; format_str = scsi_nv_to_str(defect_list_type_map, sizeof(defect_list_type_map) / sizeof(defect_list_type_map[0]), list_format & SRDD10_DLIST_FORMAT_MASK); warnx("requested defect format %s not available", format_str ? format_str : "unknown"); format_str = scsi_nv_to_str(defect_list_type_map, sizeof(defect_list_type_map) / sizeof(defect_list_type_map[0]), returned_type); if (format_str != NULL) { warnx("Device returned %s format", format_str); } else { error = 1; warnx("Device returned unknown defect" " data format %#x", returned_type); goto defect_bailout; } } else { error = 1; warnx("Error returned from read defect data command"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto defect_bailout; } } else if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = 1; warnx("Error returned from read defect data command"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto defect_bailout; } if (first_pass != 0) { fprintf(stderr, "Got %d defect", num_returned); if ((lists_specified == 0) || (num_returned == 0)) { fprintf(stderr, "s.\n"); goto defect_bailout; } else if (num_returned == 1) fprintf(stderr, ":\n"); else fprintf(stderr, "s:\n"); first_pass = 0; } /* * XXX KDM I should probably clean up the printout format for the * disk defects. */ switch (returned_type) { case SRDD10_PHYSICAL_SECTOR_FORMAT: case SRDD10_EXT_PHYS_FORMAT: { struct scsi_defect_desc_phys_sector *dlist; dlist = (struct scsi_defect_desc_phys_sector *) (defect_list + hdr_size); for (i = 0; i < num_valid; i++) { uint32_t sector; sector = scsi_4btoul(dlist[i].sector); if (returned_type == SRDD10_EXT_PHYS_FORMAT) { mads = (sector & SDD_EXT_PHYS_MADS) ? 0 : 1; sector &= ~SDD_EXT_PHYS_FLAG_MASK; } if (hex_format == 0) fprintf(stdout, "%d:%d:%d%s", scsi_3btoul(dlist[i].cylinder), dlist[i].head, scsi_4btoul(dlist[i].sector), mads ? " - " : "\n"); else fprintf(stdout, "0x%x:0x%x:0x%x%s", scsi_3btoul(dlist[i].cylinder), dlist[i].head, scsi_4btoul(dlist[i].sector), mads ? " - " : "\n"); mads = 0; } if (num_valid < num_returned) { starting_offset += num_valid; goto next_batch; } break; } case SRDD10_BYTES_FROM_INDEX_FORMAT: case SRDD10_EXT_BFI_FORMAT: { struct scsi_defect_desc_bytes_from_index *dlist; dlist = (struct scsi_defect_desc_bytes_from_index *) (defect_list + hdr_size); for (i = 0; i < num_valid; i++) { uint32_t bfi; bfi = scsi_4btoul(dlist[i].bytes_from_index); if (returned_type == SRDD10_EXT_BFI_FORMAT) { mads = (bfi & SDD_EXT_BFI_MADS) ? 1 : 0; bfi &= ~SDD_EXT_BFI_FLAG_MASK; } if (hex_format == 0) fprintf(stdout, "%d:%d:%d%s", scsi_3btoul(dlist[i].cylinder), dlist[i].head, scsi_4btoul(dlist[i].bytes_from_index), mads ? " - " : "\n"); else fprintf(stdout, "0x%x:0x%x:0x%x%s", scsi_3btoul(dlist[i].cylinder), dlist[i].head, scsi_4btoul(dlist[i].bytes_from_index), mads ? " - " : "\n"); mads = 0; } if (num_valid < num_returned) { starting_offset += num_valid; goto next_batch; } break; } case SRDDH10_BLOCK_FORMAT: { struct scsi_defect_desc_block *dlist; dlist = (struct scsi_defect_desc_block *) (defect_list + hdr_size); for (i = 0; i < num_valid; i++) { if (hex_format == 0) fprintf(stdout, "%u\n", scsi_4btoul(dlist[i].address)); else fprintf(stdout, "0x%x\n", scsi_4btoul(dlist[i].address)); } if (num_valid < num_returned) { starting_offset += num_valid; goto next_batch; } break; } case SRDD10_LONG_BLOCK_FORMAT: { struct scsi_defect_desc_long_block *dlist; dlist = (struct scsi_defect_desc_long_block *) (defect_list + hdr_size); for (i = 0; i < num_valid; i++) { if (hex_format == 0) fprintf(stdout, "%ju\n", (uintmax_t)scsi_8btou64( dlist[i].address)); else fprintf(stdout, "0x%jx\n", (uintmax_t)scsi_8btou64( dlist[i].address)); } if (num_valid < num_returned) { starting_offset += num_valid; goto next_batch; } break; } default: fprintf(stderr, "Unknown defect format 0x%x\n", returned_type); error = 1; break; } defect_bailout: if (defect_list != NULL) free(defect_list); if (ccb != NULL) cam_freeccb(ccb); return (error); } #if 0 void reassignblocks(struct cam_device *device, u_int32_t *blocks, int num_blocks) { union ccb *ccb; ccb = cam_getccb(device); cam_freeccb(ccb); } #endif void mode_sense(struct cam_device *device, int dbd, int pc, int page, int subpage, int task_attr, int retry_count, int timeout, u_int8_t *data, int datalen) { union ccb *ccb; int retval; ccb = cam_getccb(device); if (ccb == NULL) errx(1, "mode_sense: couldn't allocate CCB"); CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); scsi_mode_sense_subpage(&ccb->csio, /* retries */ retry_count, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* dbd */ dbd, /* pc */ pc << 6, /* page */ page, /* subpage */ subpage, /* param_buf */ data, /* param_len */ datalen, /* minimum_cmd_size */ 0, /* sense_len */ SSD_FULL_SIZE, /* timeout */ timeout ? timeout : 5000); if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } cam_freeccb(ccb); cam_close_device(device); if (retval < 0) err(1, "error sending mode sense command"); else errx(1, "error sending mode sense command"); } cam_freeccb(ccb); } void mode_select(struct cam_device *device, int save_pages, int task_attr, int retry_count, int timeout, u_int8_t *data, int datalen) { union ccb *ccb; int retval; ccb = cam_getccb(device); if (ccb == NULL) errx(1, "mode_select: couldn't allocate CCB"); CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); scsi_mode_select(&ccb->csio, /* retries */ retry_count, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* scsi_page_fmt */ 1, /* save_pages */ save_pages, /* param_buf */ data, /* param_len */ datalen, /* sense_len */ SSD_FULL_SIZE, /* timeout */ timeout ? timeout : 5000); if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } cam_freeccb(ccb); cam_close_device(device); if (retval < 0) err(1, "error sending mode select command"); else errx(1, "error sending mode select command"); } cam_freeccb(ccb); } void modepage(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { char *str_subpage; int c, page = -1, subpage = -1, pc = 0; int binary = 0, dbd = 0, edit = 0, list = 0; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'b': binary = 1; break; case 'd': dbd = 1; break; case 'e': edit = 1; break; case 'l': list++; break; case 'm': str_subpage = optarg; strsep(&str_subpage, ","); page = strtol(optarg, NULL, 0); if (str_subpage) subpage = strtol(str_subpage, NULL, 0); else subpage = 0; if (page < 0) errx(1, "invalid mode page %d", page); if (subpage < 0) errx(1, "invalid mode subpage %d", subpage); break; case 'P': pc = strtol(optarg, NULL, 0); if ((pc < 0) || (pc > 3)) errx(1, "invalid page control field %d", pc); break; default: break; } } if (page == -1 && list == 0) errx(1, "you must specify a mode page!"); if (list != 0) { mode_list(device, dbd, pc, list > 1, task_attr, retry_count, timeout); } else { mode_edit(device, dbd, pc, page, subpage, edit, binary, task_attr, retry_count, timeout); } } static int scsicmd(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { union ccb *ccb; u_int32_t flags = CAM_DIR_NONE; u_int8_t *data_ptr = NULL; u_int8_t cdb[20]; u_int8_t atacmd[12]; struct get_hook hook; int c, data_bytes = 0, valid_bytes; int cdb_len = 0; int atacmd_len = 0; int dmacmd = 0; int fpdmacmd = 0; int need_res = 0; char *datastr = NULL, *tstr, *resstr = NULL; int error = 0; int fd_data = 0, fd_res = 0; int retval; ccb = cam_getccb(device); if (ccb == NULL) { warnx("scsicmd: error allocating ccb"); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(ccb); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'a': tstr = optarg; while (isspace(*tstr) && (*tstr != '\0')) tstr++; hook.argc = argc - optind; hook.argv = argv + optind; hook.got = 0; atacmd_len = buff_encode_visit(atacmd, sizeof(atacmd), tstr, iget, &hook); /* * Increment optind by the number of arguments the * encoding routine processed. After each call to * getopt(3), optind points to the argument that * getopt should process _next_. In this case, * that means it points to the first command string * argument, if there is one. Once we increment * this, it should point to either the next command * line argument, or it should be past the end of * the list. */ optind += hook.got; break; case 'c': tstr = optarg; while (isspace(*tstr) && (*tstr != '\0')) tstr++; hook.argc = argc - optind; hook.argv = argv + optind; hook.got = 0; cdb_len = buff_encode_visit(cdb, sizeof(cdb), tstr, iget, &hook); /* * Increment optind by the number of arguments the * encoding routine processed. After each call to * getopt(3), optind points to the argument that * getopt should process _next_. In this case, * that means it points to the first command string * argument, if there is one. Once we increment * this, it should point to either the next command * line argument, or it should be past the end of * the list. */ optind += hook.got; break; case 'd': dmacmd = 1; break; case 'f': fpdmacmd = 1; break; case 'i': if (arglist & CAM_ARG_CMD_OUT) { warnx("command must either be " "read or write, not both"); error = 1; goto scsicmd_bailout; } arglist |= CAM_ARG_CMD_IN; flags = CAM_DIR_IN; data_bytes = strtol(optarg, NULL, 0); if (data_bytes <= 0) { warnx("invalid number of input bytes %d", data_bytes); error = 1; goto scsicmd_bailout; } hook.argc = argc - optind; hook.argv = argv + optind; hook.got = 0; optind++; datastr = cget(&hook, NULL); /* * If the user supplied "-" instead of a format, he * wants the data to be written to stdout. */ if ((datastr != NULL) && (datastr[0] == '-')) fd_data = 1; data_ptr = (u_int8_t *)malloc(data_bytes); if (data_ptr == NULL) { warnx("can't malloc memory for data_ptr"); error = 1; goto scsicmd_bailout; } break; case 'o': if (arglist & CAM_ARG_CMD_IN) { warnx("command must either be " "read or write, not both"); error = 1; goto scsicmd_bailout; } arglist |= CAM_ARG_CMD_OUT; flags = CAM_DIR_OUT; data_bytes = strtol(optarg, NULL, 0); if (data_bytes <= 0) { warnx("invalid number of output bytes %d", data_bytes); error = 1; goto scsicmd_bailout; } hook.argc = argc - optind; hook.argv = argv + optind; hook.got = 0; datastr = cget(&hook, NULL); data_ptr = (u_int8_t *)malloc(data_bytes); if (data_ptr == NULL) { warnx("can't malloc memory for data_ptr"); error = 1; goto scsicmd_bailout; } bzero(data_ptr, data_bytes); /* * If the user supplied "-" instead of a format, he * wants the data to be read from stdin. */ if ((datastr != NULL) && (datastr[0] == '-')) fd_data = 1; else buff_encode_visit(data_ptr, data_bytes, datastr, iget, &hook); optind += hook.got; break; case 'r': need_res = 1; hook.argc = argc - optind; hook.argv = argv + optind; hook.got = 0; resstr = cget(&hook, NULL); if ((resstr != NULL) && (resstr[0] == '-')) fd_res = 1; optind += hook.got; break; default: break; } } /* * If fd_data is set, and we're writing to the device, we need to * read the data the user wants written from stdin. */ if ((fd_data == 1) && (arglist & CAM_ARG_CMD_OUT)) { ssize_t amt_read; int amt_to_read = data_bytes; u_int8_t *buf_ptr = data_ptr; for (amt_read = 0; amt_to_read > 0; amt_read = read(STDIN_FILENO, buf_ptr, amt_to_read)) { if (amt_read == -1) { warn("error reading data from stdin"); error = 1; goto scsicmd_bailout; } amt_to_read -= amt_read; buf_ptr += amt_read; } } if (arglist & CAM_ARG_ERR_RECOVER) flags |= CAM_PASS_ERR_RECOVER; /* Disable freezing the device queue */ flags |= CAM_DEV_QFRZDIS; if (cdb_len) { /* * This is taken from the SCSI-3 draft spec. * (T10/1157D revision 0.3) * The top 3 bits of an opcode are the group code. * The next 5 bits are the command code. * Group 0: six byte commands * Group 1: ten byte commands * Group 2: ten byte commands * Group 3: reserved * Group 4: sixteen byte commands * Group 5: twelve byte commands * Group 6: vendor specific * Group 7: vendor specific */ switch((cdb[0] >> 5) & 0x7) { case 0: cdb_len = 6; break; case 1: case 2: cdb_len = 10; break; case 3: case 6: case 7: /* computed by buff_encode_visit */ break; case 4: cdb_len = 16; break; case 5: cdb_len = 12; break; } /* * We should probably use csio_build_visit or something like that * here, but it's easier to encode arguments as you go. The * alternative would be skipping the CDB argument and then encoding * it here, since we've got the data buffer argument by now. */ bcopy(cdb, &ccb->csio.cdb_io.cdb_bytes, cdb_len); cam_fill_csio(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*flags*/ flags, /*tag_action*/ task_attr, /*data_ptr*/ data_ptr, /*dxfer_len*/ data_bytes, /*sense_len*/ SSD_FULL_SIZE, /*cdb_len*/ cdb_len, /*timeout*/ timeout ? timeout : 5000); } else { atacmd_len = 12; bcopy(atacmd, &ccb->ataio.cmd.command, atacmd_len); if (need_res) ccb->ataio.cmd.flags |= CAM_ATAIO_NEEDRESULT; if (dmacmd) ccb->ataio.cmd.flags |= CAM_ATAIO_DMA; if (fpdmacmd) ccb->ataio.cmd.flags |= CAM_ATAIO_FPDMA; cam_fill_ataio(&ccb->ataio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*flags*/ flags, /*tag_action*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ data_bytes, /*timeout*/ timeout ? timeout : 5000); } if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } error = 1; goto scsicmd_bailout; } if (atacmd_len && need_res) { if (fd_res == 0) { buff_decode_visit(&ccb->ataio.res.status, 11, resstr, arg_put, NULL); fprintf(stdout, "\n"); } else { fprintf(stdout, "%02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n", ccb->ataio.res.status, ccb->ataio.res.error, ccb->ataio.res.lba_low, ccb->ataio.res.lba_mid, ccb->ataio.res.lba_high, ccb->ataio.res.device, ccb->ataio.res.lba_low_exp, ccb->ataio.res.lba_mid_exp, ccb->ataio.res.lba_high_exp, ccb->ataio.res.sector_count, ccb->ataio.res.sector_count_exp); fflush(stdout); } } if (cdb_len) valid_bytes = ccb->csio.dxfer_len - ccb->csio.resid; else valid_bytes = ccb->ataio.dxfer_len - ccb->ataio.resid; if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) && (arglist & CAM_ARG_CMD_IN) && (valid_bytes > 0)) { if (fd_data == 0) { buff_decode_visit(data_ptr, valid_bytes, datastr, arg_put, NULL); fprintf(stdout, "\n"); } else { ssize_t amt_written; int amt_to_write = valid_bytes; u_int8_t *buf_ptr = data_ptr; for (amt_written = 0; (amt_to_write > 0) && (amt_written =write(1, buf_ptr,amt_to_write))> 0;){ amt_to_write -= amt_written; buf_ptr += amt_written; } if (amt_written == -1) { warn("error writing data to stdout"); error = 1; goto scsicmd_bailout; } else if ((amt_written == 0) && (amt_to_write > 0)) { warnx("only wrote %u bytes out of %u", valid_bytes - amt_to_write, valid_bytes); } } } scsicmd_bailout: if ((data_bytes > 0) && (data_ptr != NULL)) free(data_ptr); cam_freeccb(ccb); return (error); } static int camdebug(int argc, char **argv, char *combinedopt) { int c, fd; path_id_t bus = CAM_BUS_WILDCARD; target_id_t target = CAM_TARGET_WILDCARD; lun_id_t lun = CAM_LUN_WILDCARD; char *tstr; union ccb ccb; int error = 0, rv; bzero(&ccb, sizeof(union ccb)); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'I': arglist |= CAM_ARG_DEBUG_INFO; ccb.cdbg.flags |= CAM_DEBUG_INFO; break; case 'P': arglist |= CAM_ARG_DEBUG_PERIPH; ccb.cdbg.flags |= CAM_DEBUG_PERIPH; break; case 'S': arglist |= CAM_ARG_DEBUG_SUBTRACE; ccb.cdbg.flags |= CAM_DEBUG_SUBTRACE; break; case 'T': arglist |= CAM_ARG_DEBUG_TRACE; ccb.cdbg.flags |= CAM_DEBUG_TRACE; break; case 'X': arglist |= CAM_ARG_DEBUG_XPT; ccb.cdbg.flags |= CAM_DEBUG_XPT; break; case 'c': arglist |= CAM_ARG_DEBUG_CDB; ccb.cdbg.flags |= CAM_DEBUG_CDB; break; case 'p': arglist |= CAM_ARG_DEBUG_PROBE; ccb.cdbg.flags |= CAM_DEBUG_PROBE; break; default: break; } } argc -= optind; argv += optind; if (argc <= 0) { warnx("you must specify \"off\", \"all\" or a bus,"); warnx("bus:target, bus:target:lun or periph"); return (1); } tstr = *argv; while (isspace(*tstr) && (*tstr != '\0')) tstr++; if (strncmp(tstr, "off", 3) == 0) { ccb.cdbg.flags = CAM_DEBUG_NONE; arglist &= ~(CAM_ARG_DEBUG_INFO|CAM_ARG_DEBUG_PERIPH| CAM_ARG_DEBUG_TRACE|CAM_ARG_DEBUG_SUBTRACE| CAM_ARG_DEBUG_XPT|CAM_ARG_DEBUG_PROBE); } else { rv = parse_btl(tstr, &bus, &target, &lun, &arglist); if (rv < 1) { warnx("you must specify \"all\", \"off\", or a bus,"); warnx("bus:target, bus:target:lun or periph to debug"); return (1); } } if ((fd = open(XPT_DEVICE, O_RDWR)) < 0) { warnx("error opening transport layer device %s", XPT_DEVICE); warn("%s", XPT_DEVICE); return (1); } ccb.ccb_h.func_code = XPT_DEBUG; ccb.ccb_h.path_id = bus; ccb.ccb_h.target_id = target; ccb.ccb_h.target_lun = lun; if (ioctl(fd, CAMIOCOMMAND, &ccb) == -1) { warn("CAMIOCOMMAND ioctl failed"); error = 1; } else { if ((ccb.ccb_h.status & CAM_STATUS_MASK) == CAM_FUNC_NOTAVAIL) { warnx("CAM debugging not available"); warnx("you need to put options CAMDEBUG in" " your kernel config file!"); error = 1; } else if ((ccb.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { warnx("XPT_DEBUG CCB failed with status %#x", ccb.ccb_h.status); error = 1; } else { if (ccb.cdbg.flags == CAM_DEBUG_NONE) { fprintf(stderr, "Debugging turned off\n"); } else { fprintf(stderr, "Debugging enabled for " "%d:%d:%jx\n", bus, target, (uintmax_t)lun); } } } close(fd); return (error); } static int tagcontrol(struct cam_device *device, int argc, char **argv, char *combinedopt) { int c; union ccb *ccb; int numtags = -1; int retval = 0; int quiet = 0; char pathstr[1024]; ccb = cam_getccb(device); if (ccb == NULL) { warnx("tagcontrol: error allocating ccb"); return (1); } while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'N': numtags = strtol(optarg, NULL, 0); if (numtags < 0) { warnx("tag count %d is < 0", numtags); retval = 1; goto tagcontrol_bailout; } break; case 'q': quiet++; break; default: break; } } cam_path_string(device, pathstr, sizeof(pathstr)); if (numtags >= 0) { CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->crs); ccb->ccb_h.func_code = XPT_REL_SIMQ; ccb->ccb_h.flags = CAM_DEV_QFREEZE; ccb->crs.release_flags = RELSIM_ADJUST_OPENINGS; ccb->crs.openings = numtags; if (cam_send_ccb(device, ccb) < 0) { perror("error sending XPT_REL_SIMQ CCB"); retval = 1; goto tagcontrol_bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { warnx("XPT_REL_SIMQ CCB failed"); cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto tagcontrol_bailout; } if (quiet == 0) fprintf(stdout, "%stagged openings now %d\n", pathstr, ccb->crs.openings); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cgds); ccb->ccb_h.func_code = XPT_GDEV_STATS; if (cam_send_ccb(device, ccb) < 0) { perror("error sending XPT_GDEV_STATS CCB"); retval = 1; goto tagcontrol_bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { warnx("XPT_GDEV_STATS CCB failed"); cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto tagcontrol_bailout; } if (arglist & CAM_ARG_VERBOSE) { fprintf(stdout, "%s", pathstr); fprintf(stdout, "dev_openings %d\n", ccb->cgds.dev_openings); fprintf(stdout, "%s", pathstr); fprintf(stdout, "dev_active %d\n", ccb->cgds.dev_active); fprintf(stdout, "%s", pathstr); fprintf(stdout, "allocated %d\n", ccb->cgds.allocated); fprintf(stdout, "%s", pathstr); fprintf(stdout, "queued %d\n", ccb->cgds.queued); fprintf(stdout, "%s", pathstr); fprintf(stdout, "held %d\n", ccb->cgds.held); fprintf(stdout, "%s", pathstr); fprintf(stdout, "mintags %d\n", ccb->cgds.mintags); fprintf(stdout, "%s", pathstr); fprintf(stdout, "maxtags %d\n", ccb->cgds.maxtags); } else { if (quiet == 0) { fprintf(stdout, "%s", pathstr); fprintf(stdout, "device openings: "); } fprintf(stdout, "%d\n", ccb->cgds.dev_openings + ccb->cgds.dev_active); } tagcontrol_bailout: cam_freeccb(ccb); return (retval); } static void cts_print(struct cam_device *device, struct ccb_trans_settings *cts) { char pathstr[1024]; cam_path_string(device, pathstr, sizeof(pathstr)); if (cts->transport == XPORT_SPI) { struct ccb_trans_settings_spi *spi = &cts->xport_specific.spi; if ((spi->valid & CTS_SPI_VALID_SYNC_RATE) != 0) { fprintf(stdout, "%ssync parameter: %d\n", pathstr, spi->sync_period); if (spi->sync_offset != 0) { u_int freq; freq = scsi_calc_syncsrate(spi->sync_period); fprintf(stdout, "%sfrequency: %d.%03dMHz\n", pathstr, freq / 1000, freq % 1000); } } if (spi->valid & CTS_SPI_VALID_SYNC_OFFSET) { fprintf(stdout, "%soffset: %d\n", pathstr, spi->sync_offset); } if (spi->valid & CTS_SPI_VALID_BUS_WIDTH) { fprintf(stdout, "%sbus width: %d bits\n", pathstr, (0x01 << spi->bus_width) * 8); } if (spi->valid & CTS_SPI_VALID_DISC) { fprintf(stdout, "%sdisconnection is %s\n", pathstr, (spi->flags & CTS_SPI_FLAGS_DISC_ENB) ? "enabled" : "disabled"); } } if (cts->transport == XPORT_FC) { struct ccb_trans_settings_fc *fc = &cts->xport_specific.fc; if (fc->valid & CTS_FC_VALID_WWNN) fprintf(stdout, "%sWWNN: 0x%llx\n", pathstr, (long long) fc->wwnn); if (fc->valid & CTS_FC_VALID_WWPN) fprintf(stdout, "%sWWPN: 0x%llx\n", pathstr, (long long) fc->wwpn); if (fc->valid & CTS_FC_VALID_PORT) fprintf(stdout, "%sPortID: 0x%x\n", pathstr, fc->port); if (fc->valid & CTS_FC_VALID_SPEED) fprintf(stdout, "%stransfer speed: %d.%03dMB/s\n", pathstr, fc->bitrate / 1000, fc->bitrate % 1000); } if (cts->transport == XPORT_SAS) { struct ccb_trans_settings_sas *sas = &cts->xport_specific.sas; if (sas->valid & CTS_SAS_VALID_SPEED) fprintf(stdout, "%stransfer speed: %d.%03dMB/s\n", pathstr, sas->bitrate / 1000, sas->bitrate % 1000); } if (cts->transport == XPORT_ATA) { struct ccb_trans_settings_pata *pata = &cts->xport_specific.ata; if ((pata->valid & CTS_ATA_VALID_MODE) != 0) { fprintf(stdout, "%sATA mode: %s\n", pathstr, ata_mode2string(pata->mode)); } if ((pata->valid & CTS_ATA_VALID_ATAPI) != 0) { fprintf(stdout, "%sATAPI packet length: %d\n", pathstr, pata->atapi); } if ((pata->valid & CTS_ATA_VALID_BYTECOUNT) != 0) { fprintf(stdout, "%sPIO transaction length: %d\n", pathstr, pata->bytecount); } } if (cts->transport == XPORT_SATA) { struct ccb_trans_settings_sata *sata = &cts->xport_specific.sata; if ((sata->valid & CTS_SATA_VALID_REVISION) != 0) { fprintf(stdout, "%sSATA revision: %d.x\n", pathstr, sata->revision); } if ((sata->valid & CTS_SATA_VALID_MODE) != 0) { fprintf(stdout, "%sATA mode: %s\n", pathstr, ata_mode2string(sata->mode)); } if ((sata->valid & CTS_SATA_VALID_ATAPI) != 0) { fprintf(stdout, "%sATAPI packet length: %d\n", pathstr, sata->atapi); } if ((sata->valid & CTS_SATA_VALID_BYTECOUNT) != 0) { fprintf(stdout, "%sPIO transaction length: %d\n", pathstr, sata->bytecount); } if ((sata->valid & CTS_SATA_VALID_PM) != 0) { fprintf(stdout, "%sPMP presence: %d\n", pathstr, sata->pm_present); } if ((sata->valid & CTS_SATA_VALID_TAGS) != 0) { fprintf(stdout, "%sNumber of tags: %d\n", pathstr, sata->tags); } if ((sata->valid & CTS_SATA_VALID_CAPS) != 0) { fprintf(stdout, "%sSATA capabilities: %08x\n", pathstr, sata->caps); } } if (cts->protocol == PROTO_ATA) { struct ccb_trans_settings_ata *ata= &cts->proto_specific.ata; if (ata->valid & CTS_ATA_VALID_TQ) { fprintf(stdout, "%stagged queueing: %s\n", pathstr, (ata->flags & CTS_ATA_FLAGS_TAG_ENB) ? "enabled" : "disabled"); } } if (cts->protocol == PROTO_SCSI) { struct ccb_trans_settings_scsi *scsi= &cts->proto_specific.scsi; if (scsi->valid & CTS_SCSI_VALID_TQ) { fprintf(stdout, "%stagged queueing: %s\n", pathstr, (scsi->flags & CTS_SCSI_FLAGS_TAG_ENB) ? "enabled" : "disabled"); } } #ifdef WITH_NVME if (cts->protocol == PROTO_NVME) { struct ccb_trans_settings_nvme *nvmex = &cts->xport_specific.nvme; if (nvmex->valid & CTS_NVME_VALID_SPEC) { fprintf(stdout, "%sNVMe Spec: %d.%d\n", pathstr, NVME_MAJOR(nvmex->spec), NVME_MINOR(nvmex->spec)); } if (nvmex->valid & CTS_NVME_VALID_LINK) { fprintf(stdout, "%sPCIe lanes: %d (%d max)\n", pathstr, nvmex->lanes, nvmex->max_lanes); fprintf(stdout, "%sPCIe Generation: %d (%d max)\n", pathstr, nvmex->speed, nvmex->max_speed); } } #endif } /* * Get a path inquiry CCB for the specified device. */ static int get_cpi(struct cam_device *device, struct ccb_pathinq *cpi) { union ccb *ccb; int retval = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("get_cpi: couldn't allocate CCB"); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cpi); ccb->ccb_h.func_code = XPT_PATH_INQ; if (cam_send_ccb(device, ccb) < 0) { warn("get_cpi: error sending Path Inquiry CCB"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto get_cpi_bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto get_cpi_bailout; } bcopy(&ccb->cpi, cpi, sizeof(struct ccb_pathinq)); get_cpi_bailout: cam_freeccb(ccb); return (retval); } /* * Get a get device CCB for the specified device. */ static int get_cgd(struct cam_device *device, struct ccb_getdev *cgd) { union ccb *ccb; int retval = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("get_cgd: couldn't allocate CCB"); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cgd); ccb->ccb_h.func_code = XPT_GDEV_TYPE; if (cam_send_ccb(device, ccb) < 0) { warn("get_cgd: error sending Path Inquiry CCB"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto get_cgd_bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto get_cgd_bailout; } bcopy(&ccb->cgd, cgd, sizeof(struct ccb_getdev)); get_cgd_bailout: cam_freeccb(ccb); return (retval); } /* * Returns 1 if the device has the VPD page, 0 if it does not, and -1 on an * error. */ int dev_has_vpd_page(struct cam_device *dev, uint8_t page_id, int retry_count, int timeout, int verbosemode) { union ccb *ccb = NULL; struct scsi_vpd_supported_page_list sup_pages; int i; int retval = 0; ccb = cam_getccb(dev); if (ccb == NULL) { warn("Unable to allocate CCB"); retval = -1; goto bailout; } /* cam_getccb cleans up the header, caller has to zero the payload */ CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); bzero(&sup_pages, sizeof(sup_pages)); scsi_inquiry(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /* tag_action */ MSG_SIMPLE_Q_TAG, /* inq_buf */ (u_int8_t *)&sup_pages, /* inq_len */ sizeof(sup_pages), /* evpd */ 1, /* page_code */ SVPD_SUPPORTED_PAGE_LIST, /* sense_len */ SSD_FULL_SIZE, /* timeout */ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (retry_count != 0) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(dev, ccb) < 0) { cam_freeccb(ccb); ccb = NULL; retval = -1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (verbosemode != 0) cam_error_print(dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = -1; goto bailout; } for (i = 0; i < sup_pages.length; i++) { if (sup_pages.list[i] == page_id) { retval = 1; goto bailout; } } bailout: if (ccb != NULL) cam_freeccb(ccb); return (retval); } /* * devtype is filled in with the type of device. * Returns 0 for success, non-zero for failure. */ int get_device_type(struct cam_device *dev, int retry_count, int timeout, int verbosemode, camcontrol_devtype *devtype) { struct ccb_getdev cgd; int retval; retval = get_cgd(dev, &cgd); if (retval != 0) goto bailout; switch (cgd.protocol) { case PROTO_SCSI: break; case PROTO_ATA: case PROTO_ATAPI: case PROTO_SATAPM: *devtype = CC_DT_ATA; goto bailout; break; /*NOTREACHED*/ case PROTO_NVME: *devtype = CC_DT_NVME; goto bailout; break; /*NOTREACHED*/ case PROTO_MMCSD: *devtype = CC_DT_MMCSD; goto bailout; break; /*NOTREACHED*/ default: *devtype = CC_DT_UNKNOWN; goto bailout; break; /*NOTREACHED*/ } if (retry_count == -1) { /* * For a retry count of -1, used only the cached data to avoid * I/O to the drive. Sending the identify command to the drive * can cause issues for SATL attachaed drives since identify is * not an NCQ command. */ if (cgd.ident_data.config != 0) *devtype = CC_DT_SATL; else *devtype = CC_DT_SCSI; } else { /* * Check for the ATA Information VPD page (0x89). If this is an * ATA device behind a SCSI to ATA translation layer (SATL), * this VPD page should be present. * * If that VPD page isn't present, or we get an error back from * the INQUIRY command, we'll just treat it as a normal SCSI * device. */ retval = dev_has_vpd_page(dev, SVPD_ATA_INFORMATION, retry_count, timeout, verbosemode); if (retval == 1) *devtype = CC_DT_SATL; else *devtype = CC_DT_SCSI; } retval = 0; bailout: return (retval); } int build_ata_cmd(union ccb *ccb, uint32_t retry_count, uint32_t flags, uint8_t tag_action, uint8_t protocol, uint8_t ata_flags, uint16_t features, uint16_t sector_count, uint64_t lba, uint8_t command, uint32_t auxiliary, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage, size_t cdb_storage_len, uint8_t sense_len, uint32_t timeout, int is48bit, camcontrol_devtype devtype) { int retval = 0; if (devtype == CC_DT_ATA) { cam_fill_ataio(&ccb->ataio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*flags*/ flags, /*tag_action*/ tag_action, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, /*timeout*/ timeout); if (is48bit || lba > ATA_MAX_28BIT_LBA) ata_48bit_cmd(&ccb->ataio, command, features, lba, sector_count); else ata_28bit_cmd(&ccb->ataio, command, features, lba, sector_count); if (auxiliary != 0) { ccb->ataio.ata_flags |= ATA_FLAG_AUX; ccb->ataio.aux = auxiliary; } if (ata_flags & AP_FLAG_CHK_COND) ccb->ataio.cmd.flags |= CAM_ATAIO_NEEDRESULT; if ((protocol & AP_PROTO_MASK) == AP_PROTO_DMA) ccb->ataio.cmd.flags |= CAM_ATAIO_DMA; else if ((protocol & AP_PROTO_MASK) == AP_PROTO_FPDMA) ccb->ataio.cmd.flags |= CAM_ATAIO_FPDMA; } else { if (is48bit || lba > ATA_MAX_28BIT_LBA) protocol |= AP_EXTEND; retval = scsi_ata_pass(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*flags*/ flags, /*tag_action*/ tag_action, /*protocol*/ protocol, /*ata_flags*/ ata_flags, /*features*/ features, /*sector_count*/ sector_count, /*lba*/ lba, /*command*/ command, /*device*/ 0, /*icc*/ 0, /*auxiliary*/ auxiliary, /*control*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, /*cdb_storage*/ cdb_storage, /*cdb_storage_len*/ cdb_storage_len, /*minimum_cmd_size*/ 0, /*sense_len*/ sense_len, /*timeout*/ timeout); } return (retval); } int get_ata_status(struct cam_device *dev, union ccb *ccb, uint8_t *error, uint16_t *count, uint64_t *lba, uint8_t *device, uint8_t *status) { int retval = 0; switch (ccb->ccb_h.func_code) { case XPT_SCSI_IO: { uint8_t opcode; int error_code = 0, sense_key = 0, asc = 0, ascq = 0; /* * In this case, we have SCSI ATA PASS-THROUGH command, 12 * or 16 byte, and need to see what */ if (ccb->ccb_h.flags & CAM_CDB_POINTER) opcode = ccb->csio.cdb_io.cdb_ptr[0]; else opcode = ccb->csio.cdb_io.cdb_bytes[0]; if ((opcode != ATA_PASS_12) && (opcode != ATA_PASS_16)) { retval = 1; warnx("%s: unsupported opcode %02x", __func__, opcode); goto bailout; } retval = scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq); /* Note: the _ccb() variant returns 0 for an error */ if (retval == 0) { retval = 1; goto bailout; } else retval = 0; switch (error_code) { case SSD_DESC_CURRENT_ERROR: case SSD_DESC_DEFERRED_ERROR: { struct scsi_sense_data_desc *sense; struct scsi_sense_ata_ret_desc *desc; uint8_t *desc_ptr; sense = (struct scsi_sense_data_desc *) &ccb->csio.sense_data; desc_ptr = scsi_find_desc(sense, ccb->csio.sense_len - ccb->csio.sense_resid, SSD_DESC_ATA); if (desc_ptr == NULL) { cam_error_print(dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } desc = (struct scsi_sense_ata_ret_desc *)desc_ptr; *error = desc->error; *count = (desc->count_15_8 << 8) | desc->count_7_0; *lba = ((uint64_t)desc->lba_47_40 << 40) | ((uint64_t)desc->lba_39_32 << 32) | ((uint64_t)desc->lba_31_24 << 24) | (desc->lba_23_16 << 16) | (desc->lba_15_8 << 8) | desc->lba_7_0; *device = desc->device; *status = desc->status; /* * If the extend bit isn't set, the result is for a * 12-byte ATA PASS-THROUGH command or a 16 or 32 byte * command without the extend bit set. This means * that the device is supposed to return 28-bit * status. The count field is only 8 bits, and the * LBA field is only 8 bits. */ if ((desc->flags & SSD_DESC_ATA_FLAG_EXTEND) == 0){ *count &= 0xff; *lba &= 0x0fffffff; } break; } case SSD_CURRENT_ERROR: case SSD_DEFERRED_ERROR: { #if 0 struct scsi_sense_data_fixed *sense; #endif /* * XXX KDM need to support fixed sense data. */ warnx("%s: Fixed sense data not supported yet", __func__); retval = 1; goto bailout; break; /*NOTREACHED*/ } default: retval = 1; goto bailout; break; } break; } case XPT_ATA_IO: { struct ata_res *res; /* * In this case, we have an ATA command, and we need to * fill in the requested values from the result register * set. */ res = &ccb->ataio.res; *error = res->error; *status = res->status; *device = res->device; *count = res->sector_count; *lba = (res->lba_high << 16) | (res->lba_mid << 8) | (res->lba_low); if (res->flags & CAM_ATAIO_48BIT) { *count |= (res->sector_count_exp << 8); *lba |= ((uint64_t)res->lba_low_exp << 24) | ((uint64_t)res->lba_mid_exp << 32) | ((uint64_t)res->lba_high_exp << 40); } else { *lba |= (res->device & 0xf) << 24; } break; } default: retval = 1; break; } bailout: return (retval); } static void cpi_print(struct ccb_pathinq *cpi) { char adapter_str[1024]; uint64_t i; snprintf(adapter_str, sizeof(adapter_str), "%s%d:", cpi->dev_name, cpi->unit_number); fprintf(stdout, "%s SIM/HBA version: %d\n", adapter_str, cpi->version_num); for (i = 1; i < UINT8_MAX; i = i << 1) { const char *str; if ((i & cpi->hba_inquiry) == 0) continue; fprintf(stdout, "%s supports ", adapter_str); switch(i) { case PI_MDP_ABLE: str = "MDP message"; break; case PI_WIDE_32: str = "32 bit wide SCSI"; break; case PI_WIDE_16: str = "16 bit wide SCSI"; break; case PI_SDTR_ABLE: str = "SDTR message"; break; case PI_LINKED_CDB: str = "linked CDBs"; break; case PI_TAG_ABLE: str = "tag queue messages"; break; case PI_SOFT_RST: str = "soft reset alternative"; break; case PI_SATAPM: str = "SATA Port Multiplier"; break; default: str = "unknown PI bit set"; break; } fprintf(stdout, "%s\n", str); } for (i = 1; i < UINT32_MAX; i = i << 1) { const char *str; if ((i & cpi->hba_misc) == 0) continue; fprintf(stdout, "%s ", adapter_str); switch(i) { case PIM_ATA_EXT: str = "can understand ata_ext requests"; break; case PIM_EXTLUNS: str = "64bit extended LUNs supported"; break; case PIM_SCANHILO: str = "bus scans from high ID to low ID"; break; case PIM_NOREMOVE: str = "removable devices not included in scan"; break; case PIM_NOINITIATOR: str = "initiator role not supported"; break; case PIM_NOBUSRESET: str = "user has disabled initial BUS RESET or" " controller is in target/mixed mode"; break; case PIM_NO_6_BYTE: str = "do not send 6-byte commands"; break; case PIM_SEQSCAN: str = "scan bus sequentially"; break; case PIM_UNMAPPED: str = "unmapped I/O supported"; break; case PIM_NOSCAN: str = "does its own scanning"; break; default: str = "unknown PIM bit set"; break; } fprintf(stdout, "%s\n", str); } for (i = 1; i < UINT16_MAX; i = i << 1) { const char *str; if ((i & cpi->target_sprt) == 0) continue; fprintf(stdout, "%s supports ", adapter_str); switch(i) { case PIT_PROCESSOR: str = "target mode processor mode"; break; case PIT_PHASE: str = "target mode phase cog. mode"; break; case PIT_DISCONNECT: str = "disconnects in target mode"; break; case PIT_TERM_IO: str = "terminate I/O message in target mode"; break; case PIT_GRP_6: str = "group 6 commands in target mode"; break; case PIT_GRP_7: str = "group 7 commands in target mode"; break; default: str = "unknown PIT bit set"; break; } fprintf(stdout, "%s\n", str); } fprintf(stdout, "%s HBA engine count: %d\n", adapter_str, cpi->hba_eng_cnt); fprintf(stdout, "%s maximum target: %d\n", adapter_str, cpi->max_target); fprintf(stdout, "%s maximum LUN: %d\n", adapter_str, cpi->max_lun); fprintf(stdout, "%s highest path ID in subsystem: %d\n", adapter_str, cpi->hpath_id); fprintf(stdout, "%s initiator ID: %d\n", adapter_str, cpi->initiator_id); fprintf(stdout, "%s SIM vendor: %s\n", adapter_str, cpi->sim_vid); fprintf(stdout, "%s HBA vendor: %s\n", adapter_str, cpi->hba_vid); fprintf(stdout, "%s HBA vendor ID: 0x%04x\n", adapter_str, cpi->hba_vendor); fprintf(stdout, "%s HBA device ID: 0x%04x\n", adapter_str, cpi->hba_device); fprintf(stdout, "%s HBA subvendor ID: 0x%04x\n", adapter_str, cpi->hba_subvendor); fprintf(stdout, "%s HBA subdevice ID: 0x%04x\n", adapter_str, cpi->hba_subdevice); fprintf(stdout, "%s bus ID: %d\n", adapter_str, cpi->bus_id); fprintf(stdout, "%s base transfer speed: ", adapter_str); if (cpi->base_transfer_speed > 1000) fprintf(stdout, "%d.%03dMB/sec\n", cpi->base_transfer_speed / 1000, cpi->base_transfer_speed % 1000); else fprintf(stdout, "%dKB/sec\n", (cpi->base_transfer_speed % 1000) * 1000); fprintf(stdout, "%s maximum transfer size: %u bytes\n", adapter_str, cpi->maxio); } static int get_print_cts(struct cam_device *device, int user_settings, int quiet, struct ccb_trans_settings *cts) { int retval; union ccb *ccb; retval = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("get_print_cts: error allocating ccb"); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cts); ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS; if (user_settings == 0) ccb->cts.type = CTS_TYPE_CURRENT_SETTINGS; else ccb->cts.type = CTS_TYPE_USER_SETTINGS; if (cam_send_ccb(device, ccb) < 0) { perror("error sending XPT_GET_TRAN_SETTINGS CCB"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto get_print_cts_bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { warnx("XPT_GET_TRANS_SETTINGS CCB failed"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto get_print_cts_bailout; } if (quiet == 0) cts_print(device, &ccb->cts); if (cts != NULL) bcopy(&ccb->cts, cts, sizeof(struct ccb_trans_settings)); get_print_cts_bailout: cam_freeccb(ccb); return (retval); } static int ratecontrol(struct cam_device *device, int task_attr, int retry_count, int timeout, int argc, char **argv, char *combinedopt) { int c; union ccb *ccb; int user_settings = 0; int retval = 0; int disc_enable = -1, tag_enable = -1; int mode = -1; int offset = -1; double syncrate = -1; int bus_width = -1; int quiet = 0; int change_settings = 0, send_tur = 0; struct ccb_pathinq cpi; ccb = cam_getccb(device); if (ccb == NULL) { warnx("ratecontrol: error allocating ccb"); return (1); } while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c){ case 'a': send_tur = 1; break; case 'c': user_settings = 0; break; case 'D': if (strncasecmp(optarg, "enable", 6) == 0) disc_enable = 1; else if (strncasecmp(optarg, "disable", 7) == 0) disc_enable = 0; else { warnx("-D argument \"%s\" is unknown", optarg); retval = 1; goto ratecontrol_bailout; } change_settings = 1; break; case 'M': mode = ata_string2mode(optarg); if (mode < 0) { warnx("unknown mode '%s'", optarg); retval = 1; goto ratecontrol_bailout; } change_settings = 1; break; case 'O': offset = strtol(optarg, NULL, 0); if (offset < 0) { warnx("offset value %d is < 0", offset); retval = 1; goto ratecontrol_bailout; } change_settings = 1; break; case 'q': quiet++; break; case 'R': syncrate = atof(optarg); if (syncrate < 0) { warnx("sync rate %f is < 0", syncrate); retval = 1; goto ratecontrol_bailout; } change_settings = 1; break; case 'T': if (strncasecmp(optarg, "enable", 6) == 0) tag_enable = 1; else if (strncasecmp(optarg, "disable", 7) == 0) tag_enable = 0; else { warnx("-T argument \"%s\" is unknown", optarg); retval = 1; goto ratecontrol_bailout; } change_settings = 1; break; case 'U': user_settings = 1; break; case 'W': bus_width = strtol(optarg, NULL, 0); if (bus_width < 0) { warnx("bus width %d is < 0", bus_width); retval = 1; goto ratecontrol_bailout; } change_settings = 1; break; default: break; } } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cpi); /* * Grab path inquiry information, so we can determine whether * or not the initiator is capable of the things that the user * requests. */ ccb->ccb_h.func_code = XPT_PATH_INQ; if (cam_send_ccb(device, ccb) < 0) { perror("error sending XPT_PATH_INQ CCB"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } retval = 1; goto ratecontrol_bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { warnx("XPT_PATH_INQ CCB failed"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } retval = 1; goto ratecontrol_bailout; } bcopy(&ccb->cpi, &cpi, sizeof(struct ccb_pathinq)); CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cts); if (quiet == 0) { fprintf(stdout, "%s parameters:\n", user_settings ? "User" : "Current"); } retval = get_print_cts(device, user_settings, quiet, &ccb->cts); if (retval != 0) goto ratecontrol_bailout; if (arglist & CAM_ARG_VERBOSE) cpi_print(&cpi); if (change_settings) { int didsettings = 0; struct ccb_trans_settings_spi *spi = NULL; struct ccb_trans_settings_pata *pata = NULL; struct ccb_trans_settings_sata *sata = NULL; struct ccb_trans_settings_ata *ata = NULL; struct ccb_trans_settings_scsi *scsi = NULL; if (ccb->cts.transport == XPORT_SPI) spi = &ccb->cts.xport_specific.spi; if (ccb->cts.transport == XPORT_ATA) pata = &ccb->cts.xport_specific.ata; if (ccb->cts.transport == XPORT_SATA) sata = &ccb->cts.xport_specific.sata; if (ccb->cts.protocol == PROTO_ATA) ata = &ccb->cts.proto_specific.ata; if (ccb->cts.protocol == PROTO_SCSI) scsi = &ccb->cts.proto_specific.scsi; ccb->cts.xport_specific.valid = 0; ccb->cts.proto_specific.valid = 0; if (spi && disc_enable != -1) { spi->valid |= CTS_SPI_VALID_DISC; if (disc_enable == 0) spi->flags &= ~CTS_SPI_FLAGS_DISC_ENB; else spi->flags |= CTS_SPI_FLAGS_DISC_ENB; didsettings++; } if (tag_enable != -1) { if ((cpi.hba_inquiry & PI_TAG_ABLE) == 0) { warnx("HBA does not support tagged queueing, " "so you cannot modify tag settings"); retval = 1; goto ratecontrol_bailout; } if (ata) { ata->valid |= CTS_SCSI_VALID_TQ; if (tag_enable == 0) ata->flags &= ~CTS_ATA_FLAGS_TAG_ENB; else ata->flags |= CTS_ATA_FLAGS_TAG_ENB; didsettings++; } else if (scsi) { scsi->valid |= CTS_SCSI_VALID_TQ; if (tag_enable == 0) scsi->flags &= ~CTS_SCSI_FLAGS_TAG_ENB; else scsi->flags |= CTS_SCSI_FLAGS_TAG_ENB; didsettings++; } } if (spi && offset != -1) { if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) { warnx("HBA is not capable of changing offset"); retval = 1; goto ratecontrol_bailout; } spi->valid |= CTS_SPI_VALID_SYNC_OFFSET; spi->sync_offset = offset; didsettings++; } if (spi && syncrate != -1) { int prelim_sync_period; if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) { warnx("HBA is not capable of changing " "transfer rates"); retval = 1; goto ratecontrol_bailout; } spi->valid |= CTS_SPI_VALID_SYNC_RATE; /* * The sync rate the user gives us is in MHz. * We need to translate it into KHz for this * calculation. */ syncrate *= 1000; /* * Next, we calculate a "preliminary" sync period * in tenths of a nanosecond. */ if (syncrate == 0) prelim_sync_period = 0; else prelim_sync_period = 10000000 / syncrate; spi->sync_period = scsi_calc_syncparam(prelim_sync_period); didsettings++; } if (sata && syncrate != -1) { if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) { warnx("HBA is not capable of changing " "transfer rates"); retval = 1; goto ratecontrol_bailout; } if (!user_settings) { warnx("You can modify only user rate " "settings for SATA"); retval = 1; goto ratecontrol_bailout; } sata->revision = ata_speed2revision(syncrate * 100); if (sata->revision < 0) { warnx("Invalid rate %f", syncrate); retval = 1; goto ratecontrol_bailout; } sata->valid |= CTS_SATA_VALID_REVISION; didsettings++; } if ((pata || sata) && mode != -1) { if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) { warnx("HBA is not capable of changing " "transfer rates"); retval = 1; goto ratecontrol_bailout; } if (!user_settings) { warnx("You can modify only user mode " "settings for ATA/SATA"); retval = 1; goto ratecontrol_bailout; } if (pata) { pata->mode = mode; pata->valid |= CTS_ATA_VALID_MODE; } else { sata->mode = mode; sata->valid |= CTS_SATA_VALID_MODE; } didsettings++; } /* * The bus_width argument goes like this: * 0 == 8 bit * 1 == 16 bit * 2 == 32 bit * Therefore, if you shift the number of bits given on the * command line right by 4, you should get the correct * number. */ if (spi && bus_width != -1) { /* * We might as well validate things here with a * decipherable error message, rather than what * will probably be an indecipherable error message * by the time it gets back to us. */ if ((bus_width == 16) && ((cpi.hba_inquiry & PI_WIDE_16) == 0)) { warnx("HBA does not support 16 bit bus width"); retval = 1; goto ratecontrol_bailout; } else if ((bus_width == 32) && ((cpi.hba_inquiry & PI_WIDE_32) == 0)) { warnx("HBA does not support 32 bit bus width"); retval = 1; goto ratecontrol_bailout; } else if ((bus_width != 8) && (bus_width != 16) && (bus_width != 32)) { warnx("Invalid bus width %d", bus_width); retval = 1; goto ratecontrol_bailout; } spi->valid |= CTS_SPI_VALID_BUS_WIDTH; spi->bus_width = bus_width >> 4; didsettings++; } if (didsettings == 0) { goto ratecontrol_bailout; } ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS; if (cam_send_ccb(device, ccb) < 0) { perror("error sending XPT_SET_TRAN_SETTINGS CCB"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } retval = 1; goto ratecontrol_bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { warnx("XPT_SET_TRANS_SETTINGS CCB failed"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } retval = 1; goto ratecontrol_bailout; } } if (send_tur) { retval = testunitready(device, task_attr, retry_count, timeout, (arglist & CAM_ARG_VERBOSE) ? 0 : 1); /* * If the TUR didn't succeed, just bail. */ if (retval != 0) { if (quiet == 0) fprintf(stderr, "Test Unit Ready failed\n"); goto ratecontrol_bailout; } } if ((change_settings || send_tur) && !quiet && (ccb->cts.transport == XPORT_ATA || ccb->cts.transport == XPORT_SATA || send_tur)) { fprintf(stdout, "New parameters:\n"); retval = get_print_cts(device, user_settings, 0, NULL); } ratecontrol_bailout: cam_freeccb(ccb); return (retval); } static int scsiformat(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { union ccb *ccb; int c; int ycount = 0, quiet = 0; int error = 0, retval = 0; int use_timeout = 10800 * 1000; int immediate = 1; struct format_defect_list_header fh; u_int8_t *data_ptr = NULL; u_int32_t dxfer_len = 0; u_int8_t byte2 = 0; int num_warnings = 0; int reportonly = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("scsiformat: error allocating ccb"); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'q': quiet++; break; case 'r': reportonly = 1; break; case 'w': immediate = 0; break; case 'y': ycount++; break; } } if (reportonly) goto doreport; if (quiet == 0) { fprintf(stdout, "You are about to REMOVE ALL DATA from the " "following device:\n"); error = scsidoinquiry(device, argc, argv, combinedopt, task_attr, retry_count, timeout); if (error != 0) { warnx("scsiformat: error sending inquiry"); goto scsiformat_bailout; } } if (ycount == 0) { if (!get_confirmation()) { error = 1; goto scsiformat_bailout; } } if (timeout != 0) use_timeout = timeout; if (quiet == 0) { fprintf(stdout, "Current format timeout is %d seconds\n", use_timeout / 1000); } /* * If the user hasn't disabled questions and didn't specify a * timeout on the command line, ask them if they want the current * timeout. */ if ((ycount == 0) && (timeout == 0)) { char str[1024]; int new_timeout = 0; fprintf(stdout, "Enter new timeout in seconds or press\n" "return to keep the current timeout [%d] ", use_timeout / 1000); if (fgets(str, sizeof(str), stdin) != NULL) { if (str[0] != '\0') new_timeout = atoi(str); } if (new_timeout != 0) { use_timeout = new_timeout * 1000; fprintf(stdout, "Using new timeout value %d\n", use_timeout / 1000); } } /* * Keep this outside the if block below to silence any unused * variable warnings. */ bzero(&fh, sizeof(fh)); /* * If we're in immediate mode, we've got to include the format * header */ if (immediate != 0) { fh.byte2 = FU_DLH_IMMED; data_ptr = (u_int8_t *)&fh; dxfer_len = sizeof(fh); byte2 = FU_FMT_DATA; } else if (quiet == 0) { fprintf(stdout, "Formatting..."); fflush(stdout); } scsi_format_unit(&ccb->csio, /* retries */ retry_count, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* byte2 */ byte2, /* ileave */ 0, /* data_ptr */ data_ptr, /* dxfer_len */ dxfer_len, /* sense_len */ SSD_FULL_SIZE, /* timeout */ use_timeout); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((immediate == 0) && ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP))) { const char errstr[] = "error sending format command"; if (retval < 0) warn(errstr); else warnx(errstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } error = 1; goto scsiformat_bailout; } /* * If we ran in non-immediate mode, we already checked for errors * above and printed out any necessary information. If we're in * immediate mode, we need to loop through and get status * information periodically. */ if (immediate == 0) { if (quiet == 0) { fprintf(stdout, "Format Complete\n"); } goto scsiformat_bailout; } doreport: do { cam_status status; CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); /* * There's really no need to do error recovery or * retries here, since we're just going to sit in a * loop and wait for the device to finish formatting. */ scsi_test_unit_ready(&ccb->csio, /* retries */ 0, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; retval = cam_send_ccb(device, ccb); /* * If we get an error from the ioctl, bail out. SCSI * errors are expected. */ if (retval < 0) { warn("error sending CAMIOCOMMAND ioctl"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } error = 1; goto scsiformat_bailout; } status = ccb->ccb_h.status & CAM_STATUS_MASK; if ((status != CAM_REQ_CMP) && (status == CAM_SCSI_STATUS_ERROR) && ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)) { struct scsi_sense_data *sense; int error_code, sense_key, asc, ascq; sense = &ccb->csio.sense_data; scsi_extract_sense_len(sense, ccb->csio.sense_len - ccb->csio.sense_resid, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); /* * According to the SCSI-2 and SCSI-3 specs, a * drive that is in the middle of a format should * return NOT READY with an ASC of "logical unit * not ready, format in progress". The sense key * specific bytes will then be a progress indicator. */ if ((sense_key == SSD_KEY_NOT_READY) && (asc == 0x04) && (ascq == 0x04)) { uint8_t sks[3]; if ((scsi_get_sks(sense, ccb->csio.sense_len - ccb->csio.sense_resid, sks) == 0) && (quiet == 0)) { uint32_t val; u_int64_t percentage; val = scsi_2btoul(&sks[1]); percentage = 10000ull * val; fprintf(stdout, "\rFormatting: %ju.%02u %% " "(%u/%d) done", (uintmax_t)(percentage / (0x10000 * 100)), (unsigned)((percentage / 0x10000) % 100), val, 0x10000); fflush(stdout); } else if ((quiet == 0) && (++num_warnings <= 1)) { warnx("Unexpected SCSI Sense Key " "Specific value returned " "during format:"); scsi_sense_print(device, &ccb->csio, stderr); warnx("Unable to print status " "information, but format will " "proceed."); warnx("will exit when format is " "complete"); } sleep(1); } else { warnx("Unexpected SCSI error during format"); cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); error = 1; goto scsiformat_bailout; } } else if (status != CAM_REQ_CMP) { warnx("Unexpected CAM status %#x", status); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); error = 1; goto scsiformat_bailout; } } while((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP); if (quiet == 0) fprintf(stdout, "\nFormat Complete\n"); scsiformat_bailout: cam_freeccb(ccb); return (error); } static int sanitize_wait_ata(struct cam_device *device, union ccb *ccb, int quiet) { struct ata_res *res; int retval; cam_status status; u_int val, perc; do { retval = ata_do_cmd(device, ccb, /*retries*/1, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA | AP_EXTEND, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SANITIZE, /*features*/0x00, /* SANITIZE STATUS EXT */ /*lba*/0, /*sector_count*/0, /*data_ptr*/NULL, /*dxfer_len*/0, /*timeout*/10000, /*is48bit*/1); if (retval < 0) { warn("error sending CAMIOCOMMAND ioctl"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } return (1); } status = ccb->ccb_h.status & CAM_STATUS_MASK; if (status == CAM_REQ_CMP) { res = &ccb->ataio.res; if (res->sector_count_exp & 0x40) { if (quiet == 0) { val = (res->lba_mid << 8) + res->lba_low; perc = 10000 * val; fprintf(stdout, "Sanitizing: %u.%02u%% (%d/%d)\r", (perc / (0x10000 * 100)), ((perc / 0x10000) % 100), val, 0x10000); fflush(stdout); } sleep(1); } else if ((res->sector_count_exp & 0x80) == 0) { warnx("Sanitize complete with an error. "); return (1); } else break; } else if (status != CAM_REQ_CMP && status != CAM_REQUEUE_REQ) { warnx("Unexpected CAM status %#x", status); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); return (1); } } while (1); return (0); } static int sanitize_wait_scsi(struct cam_device *device, union ccb *ccb, int task_attr, int quiet) { int warnings = 0, retval; cam_status status; u_int val, perc; do { CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); /* * There's really no need to do error recovery or * retries here, since we're just going to sit in a * loop and wait for the device to finish sanitizing. */ scsi_test_unit_ready(&ccb->csio, /* retries */ 0, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* sense_len */ SSD_FULL_SIZE, /* timeout */ 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; retval = cam_send_ccb(device, ccb); /* * If we get an error from the ioctl, bail out. SCSI * errors are expected. */ if (retval < 0) { warn("error sending CAMIOCOMMAND ioctl"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } return (1); } status = ccb->ccb_h.status & CAM_STATUS_MASK; if ((status == CAM_SCSI_STATUS_ERROR) && ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)) { struct scsi_sense_data *sense; int error_code, sense_key, asc, ascq; sense = &ccb->csio.sense_data; scsi_extract_sense_len(sense, ccb->csio.sense_len - ccb->csio.sense_resid, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); /* * According to the SCSI-3 spec, a drive that is in the * middle of a sanitize should return NOT READY with an * ASC of "logical unit not ready, sanitize in * progress". The sense key specific bytes will then * be a progress indicator. */ if ((sense_key == SSD_KEY_NOT_READY) && (asc == 0x04) && (ascq == 0x1b)) { uint8_t sks[3]; if ((scsi_get_sks(sense, ccb->csio.sense_len - ccb->csio.sense_resid, sks) == 0) && (quiet == 0)) { val = scsi_2btoul(&sks[1]); perc = 10000 * val; fprintf(stdout, "Sanitizing: %u.%02u%% (%d/%d)\r", (perc / (0x10000 * 100)), ((perc / 0x10000) % 100), val, 0x10000); fflush(stdout); } else if ((quiet == 0) && (++warnings <= 1)) { warnx("Unexpected SCSI Sense Key " "Specific value returned " "during sanitize:"); scsi_sense_print(device, &ccb->csio, stderr); warnx("Unable to print status " "information, but sanitze will " "proceed."); warnx("will exit when sanitize is " "complete"); } sleep(1); } else { warnx("Unexpected SCSI error during sanitize"); cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); return (1); } } else if (status != CAM_REQ_CMP && status != CAM_REQUEUE_REQ) { warnx("Unexpected CAM status %#x", status); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); return (1); } } while ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP); return (0); } static int sanitize(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { union ccb *ccb; u_int8_t action = 0; int c; int ycount = 0, quiet = 0; int error = 0; int use_timeout; int immediate = 1; int invert = 0; int passes = 0; int ause = 0; int fd = -1; const char *pattern = NULL; u_int8_t *data_ptr = NULL; u_int32_t dxfer_len = 0; uint8_t byte2; uint16_t feature, count; uint64_t lba; int reportonly = 0; camcontrol_devtype dt; /* * Get the device type, request no I/O be done to do this. */ error = get_device_type(device, -1, 0, 0, &dt); if (error != 0 || (unsigned)dt > CC_DT_UNKNOWN) { warnx("sanitize: can't get device type"); return (1); } ccb = cam_getccb(device); if (ccb == NULL) { warnx("sanitize: error allocating ccb"); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch(c) { case 'a': if (strcasecmp(optarg, "overwrite") == 0) action = SSZ_SERVICE_ACTION_OVERWRITE; else if (strcasecmp(optarg, "block") == 0) action = SSZ_SERVICE_ACTION_BLOCK_ERASE; else if (strcasecmp(optarg, "crypto") == 0) action = SSZ_SERVICE_ACTION_CRYPTO_ERASE; else if (strcasecmp(optarg, "exitfailure") == 0) action = SSZ_SERVICE_ACTION_EXIT_MODE_FAILURE; else { warnx("invalid service operation \"%s\"", optarg); error = 1; goto sanitize_bailout; } break; case 'c': passes = strtol(optarg, NULL, 0); if (passes < 1 || passes > 31) { warnx("invalid passes value %d", passes); error = 1; goto sanitize_bailout; } break; case 'I': invert = 1; break; case 'P': pattern = optarg; break; case 'q': quiet++; break; case 'U': ause = 1; break; case 'r': reportonly = 1; break; case 'w': /* ATA supports only immediate commands. */ if (dt == CC_DT_SCSI) immediate = 0; break; case 'y': ycount++; break; } } if (reportonly) goto doreport; if (action == 0) { warnx("an action is required"); error = 1; goto sanitize_bailout; } else if (action == SSZ_SERVICE_ACTION_OVERWRITE) { struct scsi_sanitize_parameter_list *pl; struct stat sb; ssize_t sz, amt; if (pattern == NULL) { warnx("overwrite action requires -P argument"); error = 1; goto sanitize_bailout; } fd = open(pattern, O_RDONLY); if (fd < 0) { warn("cannot open pattern file %s", pattern); error = 1; goto sanitize_bailout; } if (fstat(fd, &sb) < 0) { warn("cannot stat pattern file %s", pattern); error = 1; goto sanitize_bailout; } sz = sb.st_size; if (sz > SSZPL_MAX_PATTERN_LENGTH) { warnx("pattern file size exceeds maximum value %d", SSZPL_MAX_PATTERN_LENGTH); error = 1; goto sanitize_bailout; } dxfer_len = sizeof(*pl) + sz; data_ptr = calloc(1, dxfer_len); if (data_ptr == NULL) { warnx("cannot allocate parameter list buffer"); error = 1; goto sanitize_bailout; } amt = read(fd, data_ptr + sizeof(*pl), sz); if (amt < 0) { warn("cannot read pattern file"); error = 1; goto sanitize_bailout; } else if (amt != sz) { warnx("short pattern file read"); error = 1; goto sanitize_bailout; } pl = (struct scsi_sanitize_parameter_list *)data_ptr; if (passes == 0) pl->byte1 = 1; else pl->byte1 = passes; if (invert != 0) pl->byte1 |= SSZPL_INVERT; scsi_ulto2b(sz, pl->length); } else { const char *arg; if (passes != 0) arg = "-c"; else if (invert != 0) arg = "-I"; else if (pattern != NULL) arg = "-P"; else arg = NULL; if (arg != NULL) { warnx("%s argument only valid with overwrite " "operation", arg); error = 1; goto sanitize_bailout; } } if (quiet == 0) { fprintf(stdout, "You are about to REMOVE ALL DATA from the " "following device:\n"); if (dt == CC_DT_SCSI) { error = scsidoinquiry(device, argc, argv, combinedopt, task_attr, retry_count, timeout); } else if (dt == CC_DT_ATA || dt == CC_DT_SATL) { struct ata_params *ident_buf; error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf); if (error == 0) { printf("%s%d: ", device->device_name, device->dev_unit_num); ata_print_ident(ident_buf); free(ident_buf); } } else error = 1; if (error != 0) { warnx("sanitize: error sending inquiry"); goto sanitize_bailout; } } if (ycount == 0) { if (!get_confirmation()) { error = 1; goto sanitize_bailout; } } if (timeout != 0) use_timeout = timeout; else use_timeout = (immediate ? 10 : 10800) * 1000; if (immediate == 0 && quiet == 0) { fprintf(stdout, "Current sanitize timeout is %d seconds\n", use_timeout / 1000); } /* * If the user hasn't disabled questions and didn't specify a * timeout on the command line, ask them if they want the current * timeout. */ if (immediate == 0 && ycount == 0 && timeout == 0) { char str[1024]; int new_timeout = 0; fprintf(stdout, "Enter new timeout in seconds or press\n" "return to keep the current timeout [%d] ", use_timeout / 1000); if (fgets(str, sizeof(str), stdin) != NULL) { if (str[0] != '\0') new_timeout = atoi(str); } if (new_timeout != 0) { use_timeout = new_timeout * 1000; fprintf(stdout, "Using new timeout value %d\n", use_timeout / 1000); } } if (dt == CC_DT_SCSI) { byte2 = action; if (ause != 0) byte2 |= SSZ_UNRESTRICTED_EXIT; if (immediate != 0) byte2 |= SSZ_IMMED; scsi_sanitize(&ccb->csio, /* retries */ retry_count, /* cbfcnp */ NULL, /* tag_action */ task_attr, /* byte2 */ byte2, /* control */ 0, /* data_ptr */ data_ptr, /* dxfer_len */ dxfer_len, /* sense_len */ SSD_FULL_SIZE, /* timeout */ use_timeout); ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { warn("error sending sanitize command"); error = 1; goto sanitize_bailout; } } else if (dt == CC_DT_ATA || dt == CC_DT_SATL) { if (action == SSZ_SERVICE_ACTION_OVERWRITE) { feature = 0x14; /* OVERWRITE EXT */ lba = 0x4F5700000000 | scsi_4btoul(data_ptr + 4); count = (passes == 0) ? 1 : (passes >= 16) ? 0 : passes; if (invert) count |= 0x80; /* INVERT PATTERN */ if (ause) count |= 0x10; /* FAILURE MODE */ } else if (action == SSZ_SERVICE_ACTION_BLOCK_ERASE) { feature = 0x12; /* BLOCK ERASE EXT */ lba = 0x0000426B4572; count = 0; if (ause) count |= 0x10; /* FAILURE MODE */ } else if (action == SSZ_SERVICE_ACTION_CRYPTO_ERASE) { feature = 0x11; /* CRYPTO SCRAMBLE EXT */ lba = 0x000043727970; count = 0; if (ause) count |= 0x10; /* FAILURE MODE */ } else if (action == SSZ_SERVICE_ACTION_EXIT_MODE_FAILURE) { feature = 0x00; /* SANITIZE STATUS EXT */ lba = 0; count = 1; /* CLEAR SANITIZE OPERATION FAILED */ } else { error = 1; goto sanitize_bailout; } error = ata_do_cmd(device, ccb, retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA | AP_EXTEND, /*ata_flags*/AP_FLAG_CHK_COND, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SANITIZE, /*features*/feature, /*lba*/lba, /*sector_count*/count, /*data_ptr*/NULL, /*dxfer_len*/0, /*timeout*/ use_timeout, /*is48bit*/1); } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { struct scsi_sense_data *sense; int error_code, sense_key, asc, ascq; if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR) { sense = &ccb->csio.sense_data; scsi_extract_sense_len(sense, ccb->csio.sense_len - ccb->csio.sense_resid, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); if (sense_key == SSD_KEY_ILLEGAL_REQUEST && asc == 0x20 && ascq == 0x00) warnx("sanitize is not supported by " "this device"); else warnx("error sanitizing this device"); } else warnx("error sanitizing this device"); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } error = 1; goto sanitize_bailout; } /* * If we ran in non-immediate mode, we already checked for errors * above and printed out any necessary information. If we're in * immediate mode, we need to loop through and get status * information periodically. */ if (immediate == 0) { if (quiet == 0) { fprintf(stdout, "Sanitize Complete\n"); } goto sanitize_bailout; } doreport: if (dt == CC_DT_SCSI) { error = sanitize_wait_scsi(device, ccb, task_attr, quiet); } else if (dt == CC_DT_ATA || dt == CC_DT_SATL) { error = sanitize_wait_ata(device, ccb, quiet); } else error = 1; if (error == 0 && quiet == 0) fprintf(stdout, "Sanitize Complete \n"); sanitize_bailout: if (fd >= 0) close(fd); if (data_ptr != NULL) free(data_ptr); cam_freeccb(ccb); return (error); } static int scsireportluns(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { union ccb *ccb; int c, countonly, lunsonly; struct scsi_report_luns_data *lundata; int alloc_len; uint8_t report_type; uint32_t list_len, i, j; int retval; retval = 0; lundata = NULL; report_type = RPL_REPORT_DEFAULT; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating ccb", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); countonly = 0; lunsonly = 0; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'c': countonly++; break; case 'l': lunsonly++; break; case 'r': if (strcasecmp(optarg, "default") == 0) report_type = RPL_REPORT_DEFAULT; else if (strcasecmp(optarg, "wellknown") == 0) report_type = RPL_REPORT_WELLKNOWN; else if (strcasecmp(optarg, "all") == 0) report_type = RPL_REPORT_ALL; else { warnx("%s: invalid report type \"%s\"", __func__, optarg); retval = 1; goto bailout; } break; default: break; } } if ((countonly != 0) && (lunsonly != 0)) { warnx("%s: you can only specify one of -c or -l", __func__); retval = 1; goto bailout; } /* * According to SPC-4, the allocation length must be at least 16 * bytes -- enough for the header and one LUN. */ alloc_len = sizeof(*lundata) + 8; retry: lundata = malloc(alloc_len); if (lundata == NULL) { warn("%s: error mallocing %d bytes", __func__, alloc_len); retval = 1; goto bailout; } scsi_report_luns(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*select_report*/ report_type, /*rpl_buf*/ lundata, /*alloc_len*/ alloc_len, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { warn("error sending REPORT LUNS command"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } list_len = scsi_4btoul(lundata->length); /* * If we need to list the LUNs, and our allocation * length was too short, reallocate and retry. */ if ((countonly == 0) && (list_len > (alloc_len - sizeof(*lundata)))) { alloc_len = list_len + sizeof(*lundata); free(lundata); goto retry; } if (lunsonly == 0) fprintf(stdout, "%u LUN%s found\n", list_len / 8, ((list_len / 8) > 1) ? "s" : ""); if (countonly != 0) goto bailout; for (i = 0; i < (list_len / 8); i++) { int no_more; no_more = 0; for (j = 0; j < sizeof(lundata->luns[i].lundata); j += 2) { if (j != 0) fprintf(stdout, ","); switch (lundata->luns[i].lundata[j] & RPL_LUNDATA_ATYP_MASK) { case RPL_LUNDATA_ATYP_PERIPH: if ((lundata->luns[i].lundata[j] & RPL_LUNDATA_PERIPH_BUS_MASK) != 0) fprintf(stdout, "%d:", lundata->luns[i].lundata[j] & RPL_LUNDATA_PERIPH_BUS_MASK); else if ((j == 0) && ((lundata->luns[i].lundata[j+2] & RPL_LUNDATA_PERIPH_BUS_MASK) == 0)) no_more = 1; fprintf(stdout, "%d", lundata->luns[i].lundata[j+1]); break; case RPL_LUNDATA_ATYP_FLAT: { uint8_t tmplun[2]; tmplun[0] = lundata->luns[i].lundata[j] & RPL_LUNDATA_FLAT_LUN_MASK; tmplun[1] = lundata->luns[i].lundata[j+1]; fprintf(stdout, "%d", scsi_2btoul(tmplun)); no_more = 1; break; } case RPL_LUNDATA_ATYP_LUN: fprintf(stdout, "%d:%d:%d", (lundata->luns[i].lundata[j+1] & RPL_LUNDATA_LUN_BUS_MASK) >> 5, lundata->luns[i].lundata[j] & RPL_LUNDATA_LUN_TARG_MASK, lundata->luns[i].lundata[j+1] & RPL_LUNDATA_LUN_LUN_MASK); break; case RPL_LUNDATA_ATYP_EXTLUN: { int field_len_code, eam_code; eam_code = lundata->luns[i].lundata[j] & RPL_LUNDATA_EXT_EAM_MASK; field_len_code = (lundata->luns[i].lundata[j] & RPL_LUNDATA_EXT_LEN_MASK) >> 4; if ((eam_code == RPL_LUNDATA_EXT_EAM_WK) && (field_len_code == 0x00)) { fprintf(stdout, "%d", lundata->luns[i].lundata[j+1]); } else if ((eam_code == RPL_LUNDATA_EXT_EAM_NOT_SPEC) && (field_len_code == 0x03)) { uint8_t tmp_lun[8]; /* * This format takes up all 8 bytes. * If we aren't starting at offset 0, * that's a bug. */ if (j != 0) { fprintf(stdout, "Invalid " "offset %d for " "Extended LUN not " "specified format", j); no_more = 1; break; } bzero(tmp_lun, sizeof(tmp_lun)); bcopy(&lundata->luns[i].lundata[j+1], &tmp_lun[1], sizeof(tmp_lun) - 1); fprintf(stdout, "%#jx", (intmax_t)scsi_8btou64(tmp_lun)); no_more = 1; } else { fprintf(stderr, "Unknown Extended LUN" "Address method %#x, length " "code %#x", eam_code, field_len_code); no_more = 1; } break; } default: fprintf(stderr, "Unknown LUN address method " "%#x\n", lundata->luns[i].lundata[0] & RPL_LUNDATA_ATYP_MASK); break; } /* * For the flat addressing method, there are no * other levels after it. */ if (no_more != 0) break; } fprintf(stdout, "\n"); } bailout: cam_freeccb(ccb); free(lundata); return (retval); } static int scsireadcapacity(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout) { union ccb *ccb; int blocksizeonly, humanize, numblocks, quiet, sizeonly, baseten, longonly; struct scsi_read_capacity_data rcap; struct scsi_read_capacity_data_long rcaplong; uint64_t maxsector; uint32_t block_len; int retval; int c; blocksizeonly = 0; humanize = 0; longonly = 0; numblocks = 0; quiet = 0; sizeonly = 0; baseten = 0; retval = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating ccb", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'b': blocksizeonly++; break; case 'h': humanize++; baseten = 0; break; case 'H': humanize++; baseten++; break; case 'l': longonly++; break; case 'N': numblocks++; break; case 'q': quiet++; break; case 's': sizeonly++; break; default: break; } } if ((blocksizeonly != 0) && (numblocks != 0)) { warnx("%s: you can only specify one of -b or -N", __func__); retval = 1; goto bailout; } if ((blocksizeonly != 0) && (sizeonly != 0)) { warnx("%s: you can only specify one of -b or -s", __func__); retval = 1; goto bailout; } if ((humanize != 0) && (quiet != 0)) { warnx("%s: you can only specify one of -h/-H or -q", __func__); retval = 1; goto bailout; } if ((humanize != 0) && (blocksizeonly != 0)) { warnx("%s: you can only specify one of -h/-H or -b", __func__); retval = 1; goto bailout; } if (longonly != 0) goto long_only; scsi_read_capacity(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, &rcap, SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { warn("error sending READ CAPACITY command"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } maxsector = scsi_4btoul(rcap.addr); block_len = scsi_4btoul(rcap.length); /* * A last block of 2^32-1 means that the true capacity is over 2TB, * and we need to issue the long READ CAPACITY to get the real * capacity. Otherwise, we're all set. */ if (maxsector != 0xffffffff) goto do_print; long_only: scsi_read_capacity_16(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*lba*/ 0, /*reladdr*/ 0, /*pmi*/ 0, /*rcap_buf*/ (uint8_t *)&rcaplong, /*rcap_buf_len*/ sizeof(rcaplong), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAM_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { warn("error sending READ CAPACITY (16) command"); if (arglist & CAM_ARG_VERBOSE) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } maxsector = scsi_8btou64(rcaplong.addr); block_len = scsi_4btoul(rcaplong.length); do_print: if (blocksizeonly == 0) { /* * Humanize implies !quiet, and also implies numblocks. */ if (humanize != 0) { char tmpstr[6]; int64_t tmpbytes; int ret; tmpbytes = (maxsector + 1) * block_len; ret = humanize_number(tmpstr, sizeof(tmpstr), tmpbytes, "", HN_AUTOSCALE, HN_B | HN_DECIMAL | ((baseten != 0) ? HN_DIVISOR_1000 : 0)); if (ret == -1) { warnx("%s: humanize_number failed!", __func__); retval = 1; goto bailout; } fprintf(stdout, "Device Size: %s%s", tmpstr, (sizeonly == 0) ? ", " : "\n"); } else if (numblocks != 0) { fprintf(stdout, "%s%ju%s", (quiet == 0) ? "Blocks: " : "", (uintmax_t)maxsector + 1, (sizeonly == 0) ? ", " : "\n"); } else { fprintf(stdout, "%s%ju%s", (quiet == 0) ? "Last Block: " : "", (uintmax_t)maxsector, (sizeonly == 0) ? ", " : "\n"); } } if (sizeonly == 0) fprintf(stdout, "%s%u%s\n", (quiet == 0) ? "Block Length: " : "", block_len, (quiet == 0) ? " bytes" : ""); bailout: cam_freeccb(ccb); return (retval); } static int smpcmd(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { int c, error = 0; union ccb *ccb; uint8_t *smp_request = NULL, *smp_response = NULL; int request_size = 0, response_size = 0; int fd_request = 0, fd_response = 0; char *datastr = NULL; struct get_hook hook; int retval; int flags = 0; /* * Note that at the moment we don't support sending SMP CCBs to * devices that aren't probed by CAM. */ ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'R': arglist |= CAM_ARG_CMD_IN; response_size = strtol(optarg, NULL, 0); if (response_size <= 0) { warnx("invalid number of response bytes %d", response_size); error = 1; goto smpcmd_bailout; } hook.argc = argc - optind; hook.argv = argv + optind; hook.got = 0; optind++; datastr = cget(&hook, NULL); /* * If the user supplied "-" instead of a format, he * wants the data to be written to stdout. */ if ((datastr != NULL) && (datastr[0] == '-')) fd_response = 1; smp_response = (u_int8_t *)malloc(response_size); if (smp_response == NULL) { warn("can't malloc memory for SMP response"); error = 1; goto smpcmd_bailout; } break; case 'r': arglist |= CAM_ARG_CMD_OUT; request_size = strtol(optarg, NULL, 0); if (request_size <= 0) { warnx("invalid number of request bytes %d", request_size); error = 1; goto smpcmd_bailout; } hook.argc = argc - optind; hook.argv = argv + optind; hook.got = 0; datastr = cget(&hook, NULL); smp_request = (u_int8_t *)malloc(request_size); if (smp_request == NULL) { warn("can't malloc memory for SMP request"); error = 1; goto smpcmd_bailout; } bzero(smp_request, request_size); /* * If the user supplied "-" instead of a format, he * wants the data to be read from stdin. */ if ((datastr != NULL) && (datastr[0] == '-')) fd_request = 1; else buff_encode_visit(smp_request, request_size, datastr, iget, &hook); optind += hook.got; break; default: break; } } /* * If fd_data is set, and we're writing to the device, we need to * read the data the user wants written from stdin. */ if ((fd_request == 1) && (arglist & CAM_ARG_CMD_OUT)) { ssize_t amt_read; int amt_to_read = request_size; u_int8_t *buf_ptr = smp_request; for (amt_read = 0; amt_to_read > 0; amt_read = read(STDIN_FILENO, buf_ptr, amt_to_read)) { if (amt_read == -1) { warn("error reading data from stdin"); error = 1; goto smpcmd_bailout; } amt_to_read -= amt_read; buf_ptr += amt_read; } } if (((arglist & CAM_ARG_CMD_IN) == 0) || ((arglist & CAM_ARG_CMD_OUT) == 0)) { warnx("%s: need both the request (-r) and response (-R) " "arguments", __func__); error = 1; goto smpcmd_bailout; } flags |= CAM_DEV_QFRZDIS; cam_fill_smpio(&ccb->smpio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*flags*/ flags, /*smp_request*/ smp_request, /*smp_request_len*/ request_size, /*smp_response*/ smp_response, /*smp_response_len*/ response_size, /*timeout*/ timeout ? timeout : 5000); ccb->smpio.flags = SMP_FLAG_NONE; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } } if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) && (response_size > 0)) { if (fd_response == 0) { buff_decode_visit(smp_response, response_size, datastr, arg_put, NULL); fprintf(stdout, "\n"); } else { ssize_t amt_written; int amt_to_write = response_size; u_int8_t *buf_ptr = smp_response; for (amt_written = 0; (amt_to_write > 0) && (amt_written = write(STDOUT_FILENO, buf_ptr, amt_to_write)) > 0;){ amt_to_write -= amt_written; buf_ptr += amt_written; } if (amt_written == -1) { warn("error writing data to stdout"); error = 1; goto smpcmd_bailout; } else if ((amt_written == 0) && (amt_to_write > 0)) { warnx("only wrote %u bytes out of %u", response_size - amt_to_write, response_size); } } } smpcmd_bailout: if (ccb != NULL) cam_freeccb(ccb); if (smp_request != NULL) free(smp_request); if (smp_response != NULL) free(smp_response); return (error); } static int mmcsdcmd(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { int c, error = 0; union ccb *ccb; int32_t mmc_opcode = 0, mmc_arg = 0; int32_t mmc_flags = -1; int retval; int is_write = 0; int is_bw_4 = 0, is_bw_1 = 0; int is_highspeed = 0, is_stdspeed = 0; int is_info_request = 0; int flags = 0; uint8_t mmc_data_byte = 0; /* For IO_RW_EXTENDED command */ uint8_t *mmc_data = NULL; struct mmc_data mmc_d; int mmc_data_len = 0; /* * Note that at the moment we don't support sending SMP CCBs to * devices that aren't probed by CAM. */ ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); return (1); } bzero(&(&ccb->ccb_h)[1], sizeof(union ccb) - sizeof(struct ccb_hdr)); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case '4': is_bw_4 = 1; break; case '1': is_bw_1 = 1; break; case 'S': if (!strcmp(optarg, "high")) is_highspeed = 1; else is_stdspeed = 1; break; case 'I': is_info_request = 1; break; case 'c': mmc_opcode = strtol(optarg, NULL, 0); if (mmc_opcode < 0) { warnx("invalid MMC opcode %d", mmc_opcode); error = 1; goto mmccmd_bailout; } break; case 'a': mmc_arg = strtol(optarg, NULL, 0); if (mmc_arg < 0) { warnx("invalid MMC arg %d", mmc_arg); error = 1; goto mmccmd_bailout; } break; case 'f': mmc_flags = strtol(optarg, NULL, 0); if (mmc_flags < 0) { warnx("invalid MMC flags %d", mmc_flags); error = 1; goto mmccmd_bailout; } break; case 'l': mmc_data_len = strtol(optarg, NULL, 0); if (mmc_data_len <= 0) { warnx("invalid MMC data len %d", mmc_data_len); error = 1; goto mmccmd_bailout; } break; case 'W': is_write = 1; break; case 'b': mmc_data_byte = strtol(optarg, NULL, 0); break; default: break; } } flags |= CAM_DEV_QFRZDIS; /* masks are broken?! */ /* If flags are left default, supply the right flags */ if (mmc_flags < 0) switch (mmc_opcode) { case MMC_GO_IDLE_STATE: mmc_flags = MMC_RSP_NONE | MMC_CMD_BC; break; case IO_SEND_OP_COND: mmc_flags = MMC_RSP_R4; break; case SD_SEND_RELATIVE_ADDR: mmc_flags = MMC_RSP_R6 | MMC_CMD_BCR; break; case MMC_SELECT_CARD: mmc_flags = MMC_RSP_R1B | MMC_CMD_AC; mmc_arg = mmc_arg << 16; break; case SD_IO_RW_DIRECT: mmc_flags = MMC_RSP_R5 | MMC_CMD_AC; mmc_arg = SD_IO_RW_ADR(mmc_arg); if (is_write) mmc_arg |= SD_IO_RW_WR | SD_IO_RW_RAW | SD_IO_RW_DAT(mmc_data_byte); break; case SD_IO_RW_EXTENDED: mmc_flags = MMC_RSP_R5 | MMC_CMD_ADTC; mmc_arg = SD_IO_RW_ADR(mmc_arg); int len_arg = mmc_data_len; if (mmc_data_len == 512) len_arg = 0; // Byte mode mmc_arg |= SD_IOE_RW_LEN(len_arg) | SD_IO_RW_INCR; // Block mode // mmc_arg |= SD_IOE_RW_BLK | SD_IOE_RW_LEN(len_arg) | SD_IO_RW_INCR; break; default: mmc_flags = MMC_RSP_R1; break; } // Switch bus width instead of sending IO command if (is_bw_4 || is_bw_1) { struct ccb_trans_settings_mmc *cts; ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS; ccb->ccb_h.flags = 0; cts = &ccb->cts.proto_specific.mmc; cts->ios.bus_width = is_bw_4 == 1 ? bus_width_4 : bus_width_1; cts->ios_valid = MMC_BW; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { warn("Error sending command"); } else { printf("Parameters set OK\n"); } cam_freeccb(ccb); return (retval); } // Switch bus speed instead of sending IO command if (is_stdspeed || is_highspeed) { struct ccb_trans_settings_mmc *cts; ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS; ccb->ccb_h.flags = 0; cts = &ccb->cts.proto_specific.mmc; cts->ios.timing = is_highspeed == 1 ? bus_timing_hs : bus_timing_normal; cts->ios_valid = MMC_BT; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { warn("Error sending command"); } else { printf("Speed set OK (HS: %d)\n", is_highspeed); } cam_freeccb(ccb); return (retval); } // Get information about controller and its settings if (is_info_request) { ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS; ccb->ccb_h.flags = 0; struct ccb_trans_settings_mmc *cts; cts = &ccb->cts.proto_specific.mmc; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { warn("Error sending command"); return (retval); } printf("Host controller information\n"); printf("Host OCR: 0x%x\n", cts->host_ocr); printf("Min frequency: %u KHz\n", cts->host_f_min / 1000); printf("Max frequency: %u MHz\n", cts->host_f_max / 1000000); printf("Supported bus width: "); if (cts->host_caps & MMC_CAP_4_BIT_DATA) printf(" 4 bit\n"); if (cts->host_caps & MMC_CAP_8_BIT_DATA) printf(" 8 bit\n"); printf("\nCurrent settings:\n"); printf("Bus width: "); switch (cts->ios.bus_width) { case bus_width_1: printf("1 bit\n"); break; case bus_width_4: printf("4 bit\n"); break; case bus_width_8: printf("8 bit\n"); break; } printf("Freq: %d.%03d MHz%s\n", cts->ios.clock / 1000000, (cts->ios.clock / 1000) % 1000, cts->ios.timing == bus_timing_hs ? "(high-speed timing)" : ""); return (0); } printf("CMD %d arg %d flags %02x\n", mmc_opcode, mmc_arg, mmc_flags); if (mmc_data_len > 0) { flags |= CAM_DIR_IN; mmc_data = malloc(mmc_data_len); memset(mmc_data, 0, mmc_data_len); memset(&mmc_d, 0, sizeof(mmc_d)); mmc_d.len = mmc_data_len; mmc_d.data = mmc_data; mmc_d.flags = MMC_DATA_READ; } else flags |= CAM_DIR_NONE; cam_fill_mmcio(&ccb->mmcio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*flags*/ flags, /*mmc_opcode*/ mmc_opcode, /*mmc_arg*/ mmc_arg, /*mmc_flags*/ mmc_flags, /*mmc_data*/ mmc_data_len > 0 ? &mmc_d : NULL, /*timeout*/ timeout ? timeout : 5000); if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } } if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)) { printf("MMCIO: error %d, %08x %08x %08x %08x\n", ccb->mmcio.cmd.error, ccb->mmcio.cmd.resp[0], ccb->mmcio.cmd.resp[1], ccb->mmcio.cmd.resp[2], ccb->mmcio.cmd.resp[3]); switch (mmc_opcode) { case SD_IO_RW_DIRECT: printf("IO_RW_DIRECT: resp byte %02x, cur state %d\n", SD_R5_DATA(ccb->mmcio.cmd.resp), (ccb->mmcio.cmd.resp[0] >> 12) & 0x3); break; case SD_IO_RW_EXTENDED: printf("IO_RW_EXTENDED: read %d bytes w/o error:\n", mmc_data_len); hexdump(mmc_data, mmc_data_len, NULL, 0); break; case SD_SEND_RELATIVE_ADDR: printf("SEND_RELATIVE_ADDR: published RCA %02x\n", ccb->mmcio.cmd.resp[0] >> 16); break; default: printf("No command-specific decoder for CMD %d\n", mmc_opcode); } } mmccmd_bailout: if (ccb != NULL) cam_freeccb(ccb); if (mmc_data_len > 0 && mmc_data != NULL) free(mmc_data); return (error); } static int smpreportgeneral(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { union ccb *ccb; struct smp_report_general_request *request = NULL; struct smp_report_general_response *response = NULL; struct sbuf *sb = NULL; int error = 0; int c, long_response = 0; int retval; /* * Note that at the moment we don't support sending SMP CCBs to * devices that aren't probed by CAM. */ ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'l': long_response = 1; break; default: break; } } request = malloc(sizeof(*request)); if (request == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*request)); error = 1; goto bailout; } response = malloc(sizeof(*response)); if (response == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*response)); error = 1; goto bailout; } try_long: smp_report_general(&ccb->smpio, retry_count, /*cbfcnp*/ NULL, request, /*request_len*/ sizeof(*request), (uint8_t *)response, /*response_len*/ sizeof(*response), /*long_response*/ long_response, timeout); if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } error = 1; goto bailout; } /* * If the device supports the long response bit, try again and see * if we can get all of the data. */ if ((response->long_response & SMP_RG_LONG_RESPONSE) && (long_response == 0)) { ccb->ccb_h.status = CAM_REQ_INPROG; CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio); long_response = 1; goto try_long; } /* * XXX KDM detect and decode SMP errors here. */ sb = sbuf_new_auto(); if (sb == NULL) { warnx("%s: error allocating sbuf", __func__); goto bailout; } smp_report_general_sbuf(response, sizeof(*response), sb); if (sbuf_finish(sb) != 0) { warnx("%s: sbuf_finish", __func__); goto bailout; } printf("%s", sbuf_data(sb)); bailout: if (ccb != NULL) cam_freeccb(ccb); if (request != NULL) free(request); if (response != NULL) free(response); if (sb != NULL) sbuf_delete(sb); return (error); } static struct camcontrol_opts phy_ops[] = { {"nop", SMP_PC_PHY_OP_NOP, CAM_ARG_NONE, NULL}, {"linkreset", SMP_PC_PHY_OP_LINK_RESET, CAM_ARG_NONE, NULL}, {"hardreset", SMP_PC_PHY_OP_HARD_RESET, CAM_ARG_NONE, NULL}, {"disable", SMP_PC_PHY_OP_DISABLE, CAM_ARG_NONE, NULL}, {"clearerrlog", SMP_PC_PHY_OP_CLEAR_ERR_LOG, CAM_ARG_NONE, NULL}, {"clearaffiliation", SMP_PC_PHY_OP_CLEAR_AFFILIATON, CAM_ARG_NONE,NULL}, {"sataportsel", SMP_PC_PHY_OP_TRANS_SATA_PSS, CAM_ARG_NONE, NULL}, {"clearitnl", SMP_PC_PHY_OP_CLEAR_STP_ITN_LS, CAM_ARG_NONE, NULL}, {"setdevname", SMP_PC_PHY_OP_SET_ATT_DEV_NAME, CAM_ARG_NONE, NULL}, {NULL, 0, 0, NULL} }; static int smpphycontrol(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { union ccb *ccb; struct smp_phy_control_request *request = NULL; struct smp_phy_control_response *response = NULL; int long_response = 0; int retval = 0; int phy = -1; uint32_t phy_operation = SMP_PC_PHY_OP_NOP; int phy_op_set = 0; uint64_t attached_dev_name = 0; int dev_name_set = 0; uint32_t min_plr = 0, max_plr = 0; uint32_t pp_timeout_val = 0; int slumber_partial = 0; int set_pp_timeout_val = 0; int c; /* * Note that at the moment we don't support sending SMP CCBs to * devices that aren't probed by CAM. */ ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'a': case 'A': case 's': case 'S': { int enable = -1; if (strcasecmp(optarg, "enable") == 0) enable = 1; else if (strcasecmp(optarg, "disable") == 0) enable = 2; else { warnx("%s: Invalid argument %s", __func__, optarg); retval = 1; goto bailout; } switch (c) { case 's': slumber_partial |= enable << SMP_PC_SAS_SLUMBER_SHIFT; break; case 'S': slumber_partial |= enable << SMP_PC_SAS_PARTIAL_SHIFT; break; case 'a': slumber_partial |= enable << SMP_PC_SATA_SLUMBER_SHIFT; break; case 'A': slumber_partial |= enable << SMP_PC_SATA_PARTIAL_SHIFT; break; default: warnx("%s: programmer error", __func__); retval = 1; goto bailout; break; /*NOTREACHED*/ } break; } case 'd': attached_dev_name = (uintmax_t)strtoumax(optarg, NULL,0); dev_name_set = 1; break; case 'l': long_response = 1; break; case 'm': /* * We don't do extensive checking here, so this * will continue to work when new speeds come out. */ min_plr = strtoul(optarg, NULL, 0); if ((min_plr == 0) || (min_plr > 0xf)) { warnx("%s: invalid link rate %x", __func__, min_plr); retval = 1; goto bailout; } break; case 'M': /* * We don't do extensive checking here, so this * will continue to work when new speeds come out. */ max_plr = strtoul(optarg, NULL, 0); if ((max_plr == 0) || (max_plr > 0xf)) { warnx("%s: invalid link rate %x", __func__, max_plr); retval = 1; goto bailout; } break; case 'o': { camcontrol_optret optreturn; cam_argmask argnums; const char *subopt; if (phy_op_set != 0) { warnx("%s: only one phy operation argument " "(-o) allowed", __func__); retval = 1; goto bailout; } phy_op_set = 1; /* * Allow the user to specify the phy operation * numerically, as well as with a name. This will * future-proof it a bit, so options that are added * in future specs can be used. */ if (isdigit(optarg[0])) { phy_operation = strtoul(optarg, NULL, 0); if ((phy_operation == 0) || (phy_operation > 0xff)) { warnx("%s: invalid phy operation %#x", __func__, phy_operation); retval = 1; goto bailout; } break; } optreturn = getoption(phy_ops, optarg, &phy_operation, &argnums, &subopt); if (optreturn == CC_OR_AMBIGUOUS) { warnx("%s: ambiguous option %s", __func__, optarg); usage(0); retval = 1; goto bailout; } else if (optreturn == CC_OR_NOT_FOUND) { warnx("%s: option %s not found", __func__, optarg); usage(0); retval = 1; goto bailout; } break; } case 'p': phy = atoi(optarg); break; case 'T': pp_timeout_val = strtoul(optarg, NULL, 0); if (pp_timeout_val > 15) { warnx("%s: invalid partial pathway timeout " "value %u, need a value less than 16", __func__, pp_timeout_val); retval = 1; goto bailout; } set_pp_timeout_val = 1; break; default: break; } } if (phy == -1) { warnx("%s: a PHY (-p phy) argument is required",__func__); retval = 1; goto bailout; } if (((dev_name_set != 0) && (phy_operation != SMP_PC_PHY_OP_SET_ATT_DEV_NAME)) || ((phy_operation == SMP_PC_PHY_OP_SET_ATT_DEV_NAME) && (dev_name_set == 0))) { warnx("%s: -d name and -o setdevname arguments both " "required to set device name", __func__); retval = 1; goto bailout; } request = malloc(sizeof(*request)); if (request == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*request)); retval = 1; goto bailout; } response = malloc(sizeof(*response)); if (response == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*response)); retval = 1; goto bailout; } smp_phy_control(&ccb->smpio, retry_count, /*cbfcnp*/ NULL, request, sizeof(*request), (uint8_t *)response, sizeof(*response), long_response, /*expected_exp_change_count*/ 0, phy, phy_operation, (set_pp_timeout_val != 0) ? 1 : 0, attached_dev_name, min_plr, max_plr, slumber_partial, pp_timeout_val, timeout); if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { /* * Use CAM_EPF_NORMAL so we only get one line of * SMP command decoding. */ cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_NORMAL, stderr); } retval = 1; goto bailout; } /* XXX KDM print out something here for success? */ bailout: if (ccb != NULL) cam_freeccb(ccb); if (request != NULL) free(request); if (response != NULL) free(response); return (retval); } static int smpmaninfo(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { union ccb *ccb; struct smp_report_manuf_info_request request; struct smp_report_manuf_info_response response; struct sbuf *sb = NULL; int long_response = 0; int retval = 0; int c; /* * Note that at the moment we don't support sending SMP CCBs to * devices that aren't probed by CAM. */ ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio); while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'l': long_response = 1; break; default: break; } } bzero(&request, sizeof(request)); bzero(&response, sizeof(response)); smp_report_manuf_info(&ccb->smpio, retry_count, /*cbfcnp*/ NULL, &request, sizeof(request), (uint8_t *)&response, sizeof(response), long_response, timeout); if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } retval = 1; goto bailout; } sb = sbuf_new_auto(); if (sb == NULL) { warnx("%s: error allocating sbuf", __func__); goto bailout; } smp_report_manuf_info_sbuf(&response, sizeof(response), sb); if (sbuf_finish(sb) != 0) { warnx("%s: sbuf_finish", __func__); goto bailout; } printf("%s", sbuf_data(sb)); bailout: if (ccb != NULL) cam_freeccb(ccb); if (sb != NULL) sbuf_delete(sb); return (retval); } static int getdevid(struct cam_devitem *item) { int retval = 0; union ccb *ccb = NULL; struct cam_device *dev; dev = cam_open_btl(item->dev_match.path_id, item->dev_match.target_id, item->dev_match.target_lun, O_RDWR, NULL); if (dev == NULL) { warnx("%s", cam_errbuf); retval = 1; goto bailout; } item->device_id_len = 0; ccb = cam_getccb(dev); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); retval = 1; goto bailout; } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cdai); /* * On the first try, we just probe for the size of the data, and * then allocate that much memory and try again. */ retry: ccb->ccb_h.func_code = XPT_DEV_ADVINFO; ccb->ccb_h.flags = CAM_DIR_IN; ccb->cdai.flags = CDAI_FLAG_NONE; ccb->cdai.buftype = CDAI_TYPE_SCSI_DEVID; ccb->cdai.bufsiz = item->device_id_len; if (item->device_id_len != 0) ccb->cdai.buf = (uint8_t *)item->device_id; if (cam_send_ccb(dev, ccb) < 0) { warn("%s: error sending XPT_GDEV_ADVINFO CCB", __func__); retval = 1; goto bailout; } if (ccb->ccb_h.status != CAM_REQ_CMP) { warnx("%s: CAM status %#x", __func__, ccb->ccb_h.status); retval = 1; goto bailout; } if (item->device_id_len == 0) { /* * This is our first time through. Allocate the buffer, * and then go back to get the data. */ if (ccb->cdai.provsiz == 0) { warnx("%s: invalid .provsiz field returned with " "XPT_GDEV_ADVINFO CCB", __func__); retval = 1; goto bailout; } item->device_id_len = ccb->cdai.provsiz; item->device_id = malloc(item->device_id_len); if (item->device_id == NULL) { warn("%s: unable to allocate %d bytes", __func__, item->device_id_len); retval = 1; goto bailout; } ccb->ccb_h.status = CAM_REQ_INPROG; goto retry; } bailout: if (dev != NULL) cam_close_device(dev); if (ccb != NULL) cam_freeccb(ccb); return (retval); } /* * XXX KDM merge this code with getdevtree()? */ static int buildbusdevlist(struct cam_devlist *devlist) { union ccb ccb; int bufsize, fd = -1; struct dev_match_pattern *patterns; struct cam_devitem *item = NULL; int skip_device = 0; int retval = 0; if ((fd = open(XPT_DEVICE, O_RDWR)) == -1) { warn("couldn't open %s", XPT_DEVICE); return (1); } bzero(&ccb, sizeof(union ccb)); ccb.ccb_h.path_id = CAM_XPT_PATH_ID; ccb.ccb_h.target_id = CAM_TARGET_WILDCARD; ccb.ccb_h.target_lun = CAM_LUN_WILDCARD; ccb.ccb_h.func_code = XPT_DEV_MATCH; bufsize = sizeof(struct dev_match_result) * 100; ccb.cdm.match_buf_len = bufsize; ccb.cdm.matches = (struct dev_match_result *)malloc(bufsize); if (ccb.cdm.matches == NULL) { warnx("can't malloc memory for matches"); close(fd); return (1); } ccb.cdm.num_matches = 0; ccb.cdm.num_patterns = 2; ccb.cdm.pattern_buf_len = sizeof(struct dev_match_pattern) * ccb.cdm.num_patterns; patterns = (struct dev_match_pattern *)malloc(ccb.cdm.pattern_buf_len); if (patterns == NULL) { warnx("can't malloc memory for patterns"); retval = 1; goto bailout; } ccb.cdm.patterns = patterns; bzero(patterns, ccb.cdm.pattern_buf_len); patterns[0].type = DEV_MATCH_DEVICE; patterns[0].pattern.device_pattern.flags = DEV_MATCH_PATH; patterns[0].pattern.device_pattern.path_id = devlist->path_id; patterns[1].type = DEV_MATCH_PERIPH; patterns[1].pattern.periph_pattern.flags = PERIPH_MATCH_PATH; patterns[1].pattern.periph_pattern.path_id = devlist->path_id; /* * We do the ioctl multiple times if necessary, in case there are * more than 100 nodes in the EDT. */ do { unsigned int i; if (ioctl(fd, CAMIOCOMMAND, &ccb) == -1) { warn("error sending CAMIOCOMMAND ioctl"); retval = 1; goto bailout; } if ((ccb.ccb_h.status != CAM_REQ_CMP) || ((ccb.cdm.status != CAM_DEV_MATCH_LAST) && (ccb.cdm.status != CAM_DEV_MATCH_MORE))) { warnx("got CAM error %#x, CDM error %d\n", ccb.ccb_h.status, ccb.cdm.status); retval = 1; goto bailout; } for (i = 0; i < ccb.cdm.num_matches; i++) { switch (ccb.cdm.matches[i].type) { case DEV_MATCH_DEVICE: { struct device_match_result *dev_result; dev_result = &ccb.cdm.matches[i].result.device_result; if (dev_result->flags & DEV_RESULT_UNCONFIGURED) { skip_device = 1; break; } else skip_device = 0; item = malloc(sizeof(*item)); if (item == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*item)); retval = 1; goto bailout; } bzero(item, sizeof(*item)); bcopy(dev_result, &item->dev_match, sizeof(*dev_result)); STAILQ_INSERT_TAIL(&devlist->dev_queue, item, links); if (getdevid(item) != 0) { retval = 1; goto bailout; } break; } case DEV_MATCH_PERIPH: { struct periph_match_result *periph_result; periph_result = &ccb.cdm.matches[i].result.periph_result; if (skip_device != 0) break; item->num_periphs++; item->periph_matches = realloc( item->periph_matches, item->num_periphs * sizeof(struct periph_match_result)); if (item->periph_matches == NULL) { warn("%s: error allocating periph " "list", __func__); retval = 1; goto bailout; } bcopy(periph_result, &item->periph_matches[ item->num_periphs - 1], sizeof(*periph_result)); break; } default: fprintf(stderr, "%s: unexpected match " "type %d\n", __func__, ccb.cdm.matches[i].type); retval = 1; goto bailout; break; /*NOTREACHED*/ } } } while ((ccb.ccb_h.status == CAM_REQ_CMP) && (ccb.cdm.status == CAM_DEV_MATCH_MORE)); bailout: if (fd != -1) close(fd); free(patterns); free(ccb.cdm.matches); if (retval != 0) freebusdevlist(devlist); return (retval); } static void freebusdevlist(struct cam_devlist *devlist) { struct cam_devitem *item, *item2; STAILQ_FOREACH_SAFE(item, &devlist->dev_queue, links, item2) { STAILQ_REMOVE(&devlist->dev_queue, item, cam_devitem, links); free(item->device_id); free(item->periph_matches); free(item); } } static struct cam_devitem * findsasdevice(struct cam_devlist *devlist, uint64_t sasaddr) { struct cam_devitem *item; STAILQ_FOREACH(item, &devlist->dev_queue, links) { struct scsi_vpd_id_descriptor *idd; /* * XXX KDM look for LUN IDs as well? */ idd = scsi_get_devid(item->device_id, item->device_id_len, scsi_devid_is_sas_target); if (idd == NULL) continue; if (scsi_8btou64(idd->identifier) == sasaddr) return (item); } return (NULL); } static int smpphylist(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { struct smp_report_general_request *rgrequest = NULL; struct smp_report_general_response *rgresponse = NULL; struct smp_discover_request *disrequest = NULL; struct smp_discover_response *disresponse = NULL; struct cam_devlist devlist; union ccb *ccb; int long_response = 0; int num_phys = 0; int quiet = 0; int retval; int i, c; /* * Note that at the moment we don't support sending SMP CCBs to * devices that aren't probed by CAM. */ ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio); STAILQ_INIT(&devlist.dev_queue); rgrequest = malloc(sizeof(*rgrequest)); if (rgrequest == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*rgrequest)); retval = 1; goto bailout; } rgresponse = malloc(sizeof(*rgresponse)); if (rgresponse == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*rgresponse)); retval = 1; goto bailout; } while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'l': long_response = 1; break; case 'q': quiet = 1; break; default: break; } } smp_report_general(&ccb->smpio, retry_count, /*cbfcnp*/ NULL, rgrequest, /*request_len*/ sizeof(*rgrequest), (uint8_t *)rgresponse, /*response_len*/ sizeof(*rgresponse), /*long_response*/ long_response, timeout); ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (((retval = cam_send_ccb(device, ccb)) < 0) || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } retval = 1; goto bailout; } num_phys = rgresponse->num_phys; if (num_phys == 0) { if (quiet == 0) fprintf(stdout, "%s: No Phys reported\n", __func__); retval = 1; goto bailout; } devlist.path_id = device->path_id; retval = buildbusdevlist(&devlist); if (retval != 0) goto bailout; if (quiet == 0) { fprintf(stdout, "%d PHYs:\n", num_phys); fprintf(stdout, "PHY Attached SAS Address\n"); } disrequest = malloc(sizeof(*disrequest)); if (disrequest == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*disrequest)); retval = 1; goto bailout; } disresponse = malloc(sizeof(*disresponse)); if (disresponse == NULL) { warn("%s: unable to allocate %zd bytes", __func__, sizeof(*disresponse)); retval = 1; goto bailout; } for (i = 0; i < num_phys; i++) { struct cam_devitem *item; struct device_match_result *dev_match; char vendor[16], product[48], revision[16]; char tmpstr[256]; int j; CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio); ccb->ccb_h.status = CAM_REQ_INPROG; ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; smp_discover(&ccb->smpio, retry_count, /*cbfcnp*/ NULL, disrequest, sizeof(*disrequest), (uint8_t *)disresponse, sizeof(*disresponse), long_response, /*ignore_zone_group*/ 0, /*phy*/ i, timeout); if (((retval = cam_send_ccb(device, ccb)) < 0) || (((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) && (disresponse->function_result != SMP_FR_PHY_VACANT))) { const char warnstr[] = "error sending command"; if (retval < 0) warn(warnstr); else warnx(warnstr); if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } retval = 1; goto bailout; } if (disresponse->function_result == SMP_FR_PHY_VACANT) { if (quiet == 0) fprintf(stdout, "%3d \n", i); continue; } if (disresponse->attached_device == SMP_DIS_AD_TYPE_NONE) { item = NULL; } else { item = findsasdevice(&devlist, scsi_8btou64(disresponse->attached_sas_address)); } if ((quiet == 0) || (item != NULL)) { fprintf(stdout, "%3d 0x%016jx", i, (uintmax_t)scsi_8btou64( disresponse->attached_sas_address)); if (item == NULL) { fprintf(stdout, "\n"); continue; } } else if (quiet != 0) continue; dev_match = &item->dev_match; if (dev_match->protocol == PROTO_SCSI) { cam_strvis(vendor, dev_match->inq_data.vendor, sizeof(dev_match->inq_data.vendor), sizeof(vendor)); cam_strvis(product, dev_match->inq_data.product, sizeof(dev_match->inq_data.product), sizeof(product)); cam_strvis(revision, dev_match->inq_data.revision, sizeof(dev_match->inq_data.revision), sizeof(revision)); sprintf(tmpstr, "<%s %s %s>", vendor, product, revision); } else if ((dev_match->protocol == PROTO_ATA) || (dev_match->protocol == PROTO_SATAPM)) { cam_strvis(product, dev_match->ident_data.model, sizeof(dev_match->ident_data.model), sizeof(product)); cam_strvis(revision, dev_match->ident_data.revision, sizeof(dev_match->ident_data.revision), sizeof(revision)); sprintf(tmpstr, "<%s %s>", product, revision); } else { sprintf(tmpstr, "<>"); } fprintf(stdout, " %-33s ", tmpstr); /* * If we have 0 periphs, that's a bug... */ if (item->num_periphs == 0) { fprintf(stdout, "\n"); continue; } fprintf(stdout, "("); for (j = 0; j < item->num_periphs; j++) { if (j > 0) fprintf(stdout, ","); fprintf(stdout, "%s%d", item->periph_matches[j].periph_name, item->periph_matches[j].unit_number); } fprintf(stdout, ")\n"); } bailout: if (ccb != NULL) cam_freeccb(ccb); free(rgrequest); free(rgresponse); free(disrequest); free(disresponse); freebusdevlist(&devlist); return (retval); } static int atapm_proc_resp(struct cam_device *device, union ccb *ccb) { struct ata_res *res; res = &ccb->ataio.res; if (res->status & ATA_STATUS_ERROR) { if (arglist & CAM_ARG_VERBOSE) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); printf("error = 0x%02x, sector_count = 0x%04x, " "device = 0x%02x, status = 0x%02x\n", res->error, res->sector_count, res->device, res->status); } return (1); } if (arglist & CAM_ARG_VERBOSE) { fprintf(stdout, "%s%d: Raw native check power data:\n", device->device_name, device->dev_unit_num); /* res is 4 byte aligned */ dump_data((uint16_t*)(uintptr_t)res, sizeof(struct ata_res)); printf("error = 0x%02x, sector_count = 0x%04x, device = 0x%02x, " "status = 0x%02x\n", res->error, res->sector_count, res->device, res->status); } printf("%s%d: ", device->device_name, device->dev_unit_num); switch (res->sector_count) { case 0x00: printf("Standby mode\n"); break; case 0x40: printf("NV Cache Power Mode and the spindle is spun down or spinning down\n"); break; case 0x41: printf("NV Cache Power Mode and the spindle is spun up or spinning up\n"); break; case 0x80: printf("Idle mode\n"); break; case 0xff: printf("Active or Idle mode\n"); break; default: printf("Unknown mode 0x%02x\n", res->sector_count); break; } return (0); } static int atapm(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { union ccb *ccb; int retval = 0; int t = -1; int c; u_int8_t ata_flags = 0; u_char cmd, sc; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating ccb", __func__); return (1); } while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 't': t = atoi(optarg); break; default: break; } } if (strcmp(argv[1], "idle") == 0) { if (t == -1) cmd = ATA_IDLE_IMMEDIATE; else cmd = ATA_IDLE_CMD; } else if (strcmp(argv[1], "standby") == 0) { if (t == -1) cmd = ATA_STANDBY_IMMEDIATE; else cmd = ATA_STANDBY_CMD; } else if (strcmp(argv[1], "powermode") == 0) { cmd = ATA_CHECK_POWER_MODE; ata_flags = AP_FLAG_CHK_COND; t = -1; } else { cmd = ATA_SLEEP; t = -1; } if (t < 0) sc = 0; else if (t <= (240 * 5)) sc = (t + 4) / 5; else if (t <= (252 * 5)) /* special encoding for 21 minutes */ sc = 252; else if (t <= (11 * 30 * 60)) sc = (t - 1) / (30 * 60) + 241; else sc = 253; retval = ata_do_cmd(device, ccb, /*retries*/retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA, /*ata_flags*/ata_flags, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/cmd, /*features*/0, /*lba*/0, /*sector_count*/sc, /*data_ptr*/NULL, /*dxfer_len*/0, /*timeout*/timeout ? timeout : 30 * 1000, /*quiet*/1); cam_freeccb(ccb); if (retval || cmd != ATA_CHECK_POWER_MODE) return (retval); return (atapm_proc_resp(device, ccb)); } static int ataaxm(struct cam_device *device, int argc, char **argv, char *combinedopt, int retry_count, int timeout) { union ccb *ccb; int retval = 0; int l = -1; int c; u_char cmd, sc; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating ccb", __func__); return (1); } while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'l': l = atoi(optarg); break; default: break; } } sc = 0; if (strcmp(argv[1], "apm") == 0) { if (l == -1) cmd = 0x85; else { cmd = 0x05; sc = l; } } else /* aam */ { if (l == -1) cmd = 0xC2; else { cmd = 0x42; sc = l; } } retval = ata_do_28bit_cmd(device, ccb, /*retries*/retry_count, /*flags*/CAM_DIR_NONE, /*protocol*/AP_PROTO_NON_DATA, /*tag_action*/MSG_SIMPLE_Q_TAG, /*command*/ATA_SETFEATURES, /*features*/cmd, /*lba*/0, /*sector_count*/sc, /*data_ptr*/NULL, /*dxfer_len*/0, /*timeout*/timeout ? timeout : 30 * 1000, /*quiet*/1); cam_freeccb(ccb); return (retval); } int scsigetopcodes(struct cam_device *device, int opcode_set, int opcode, int show_sa_errors, int sa_set, int service_action, int timeout_desc, int task_attr, int retry_count, int timeout, int verbosemode, uint32_t *fill_len, uint8_t **data_ptr) { union ccb *ccb = NULL; uint8_t *buf = NULL; uint32_t alloc_len = 0, num_opcodes; uint32_t valid_len = 0; uint32_t avail_len = 0; struct scsi_report_supported_opcodes_all *all_hdr; struct scsi_report_supported_opcodes_one *one; int options = 0; int retval = 0; /* * Make it clear that we haven't yet allocated or filled anything. */ *fill_len = 0; *data_ptr = NULL; ccb = cam_getccb(device); if (ccb == NULL) { warnx("couldn't allocate CCB"); retval = 1; goto bailout; } /* cam_getccb cleans up the header, caller has to zero the payload */ CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); if (opcode_set != 0) { options |= RSO_OPTIONS_OC; num_opcodes = 1; alloc_len = sizeof(*one) + CAM_MAX_CDBLEN; } else { num_opcodes = 256; alloc_len = sizeof(*all_hdr) + (num_opcodes * sizeof(struct scsi_report_supported_opcodes_descr)); } if (timeout_desc != 0) { options |= RSO_RCTD; alloc_len += num_opcodes * sizeof(struct scsi_report_supported_opcodes_timeout); } if (sa_set != 0) { options |= RSO_OPTIONS_OC_SA; if (show_sa_errors != 0) options &= ~RSO_OPTIONS_OC; } retry_alloc: if (buf != NULL) { free(buf); buf = NULL; } buf = malloc(alloc_len); if (buf == NULL) { warn("Unable to allocate %u bytes", alloc_len); retval = 1; goto bailout; } bzero(buf, alloc_len); scsi_report_supported_opcodes(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*options*/ options, /*req_opcode*/ opcode, /*req_service_action*/ service_action, /*data_ptr*/ buf, /*dxfer_len*/ alloc_len, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 10000); ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (retry_count != 0) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(device, ccb) < 0) { perror("error sending REPORT SUPPORTED OPERATION CODES"); retval = 1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (verbosemode != 0) cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } valid_len = ccb->csio.dxfer_len - ccb->csio.resid; if (((options & RSO_OPTIONS_MASK) == RSO_OPTIONS_ALL) && (valid_len >= sizeof(*all_hdr))) { all_hdr = (struct scsi_report_supported_opcodes_all *)buf; avail_len = scsi_4btoul(all_hdr->length) + sizeof(*all_hdr); } else if (((options & RSO_OPTIONS_MASK) != RSO_OPTIONS_ALL) && (valid_len >= sizeof(*one))) { uint32_t cdb_length; one = (struct scsi_report_supported_opcodes_one *)buf; cdb_length = scsi_2btoul(one->cdb_length); avail_len = sizeof(*one) + cdb_length; if (one->support & RSO_ONE_CTDP) { struct scsi_report_supported_opcodes_timeout *td; td = (struct scsi_report_supported_opcodes_timeout *) &buf[avail_len]; if (valid_len >= (avail_len + sizeof(td->length))) { avail_len += scsi_2btoul(td->length) + sizeof(td->length); } else { avail_len += sizeof(*td); } } } /* * avail_len could be zero if we didn't get enough data back from * thet target to determine */ if ((avail_len != 0) && (avail_len > valid_len)) { alloc_len = avail_len; goto retry_alloc; } *fill_len = valid_len; *data_ptr = buf; bailout: if (retval != 0) free(buf); cam_freeccb(ccb); return (retval); } static int scsiprintoneopcode(struct cam_device *device, int req_opcode, int sa_set, int req_sa, uint8_t *buf, uint32_t valid_len) { struct scsi_report_supported_opcodes_one *one; struct scsi_report_supported_opcodes_timeout *td; uint32_t cdb_len = 0, td_len = 0; const char *op_desc = NULL; unsigned int i; int retval = 0; one = (struct scsi_report_supported_opcodes_one *)buf; /* * If we don't have the full single opcode descriptor, no point in * continuing. */ if (valid_len < __offsetof(struct scsi_report_supported_opcodes_one, cdb_length)) { warnx("Only %u bytes returned, not enough to verify support", valid_len); retval = 1; goto bailout; } op_desc = scsi_op_desc(req_opcode, &device->inq_data); printf("%s (0x%02x)", op_desc != NULL ? op_desc : "UNKNOWN", req_opcode); if (sa_set != 0) printf(", SA 0x%x", req_sa); printf(": "); switch (one->support & RSO_ONE_SUP_MASK) { case RSO_ONE_SUP_UNAVAIL: printf("No command support information currently available\n"); break; case RSO_ONE_SUP_NOT_SUP: printf("Command not supported\n"); retval = 1; goto bailout; break; /*NOTREACHED*/ case RSO_ONE_SUP_AVAIL: printf("Command is supported, complies with a SCSI standard\n"); break; case RSO_ONE_SUP_VENDOR: printf("Command is supported, vendor-specific " "implementation\n"); break; default: printf("Unknown command support flags 0x%#x\n", one->support & RSO_ONE_SUP_MASK); break; } /* * If we don't have the CDB length, it isn't exactly an error, the * command probably isn't supported. */ if (valid_len < __offsetof(struct scsi_report_supported_opcodes_one, cdb_usage)) goto bailout; cdb_len = scsi_2btoul(one->cdb_length); /* * If our valid data doesn't include the full reported length, * return. The caller should have detected this and adjusted his * allocation length to get all of the available data. */ if (valid_len < sizeof(*one) + cdb_len) { retval = 1; goto bailout; } /* * If all we have is the opcode, there is no point in printing out * the usage bitmap. */ if (cdb_len <= 1) { retval = 1; goto bailout; } printf("CDB usage bitmap:"); for (i = 0; i < cdb_len; i++) { printf(" %02x", one->cdb_usage[i]); } printf("\n"); /* * If we don't have a timeout descriptor, we're done. */ if ((one->support & RSO_ONE_CTDP) == 0) goto bailout; /* * If we don't have enough valid length to include the timeout * descriptor length, we're done. */ if (valid_len < (sizeof(*one) + cdb_len + sizeof(td->length))) goto bailout; td = (struct scsi_report_supported_opcodes_timeout *) &buf[sizeof(*one) + cdb_len]; td_len = scsi_2btoul(td->length); td_len += sizeof(td->length); /* * If we don't have the full timeout descriptor, we're done. */ if (td_len < sizeof(*td)) goto bailout; /* * If we don't have enough valid length to contain the full timeout * descriptor, we're done. */ if (valid_len < (sizeof(*one) + cdb_len + td_len)) goto bailout; printf("Timeout information:\n"); printf("Command-specific: 0x%02x\n", td->cmd_specific); printf("Nominal timeout: %u seconds\n", scsi_4btoul(td->nominal_time)); printf("Recommended timeout: %u seconds\n", scsi_4btoul(td->recommended_time)); bailout: return (retval); } static int scsiprintopcodes(struct cam_device *device, int td_req, uint8_t *buf, uint32_t valid_len) { struct scsi_report_supported_opcodes_all *hdr; struct scsi_report_supported_opcodes_descr *desc; uint32_t avail_len = 0, used_len = 0; uint8_t *cur_ptr; int retval = 0; if (valid_len < sizeof(*hdr)) { warnx("%s: not enough returned data (%u bytes) opcode list", __func__, valid_len); retval = 1; goto bailout; } hdr = (struct scsi_report_supported_opcodes_all *)buf; avail_len = scsi_4btoul(hdr->length); avail_len += sizeof(hdr->length); /* * Take the lesser of the amount of data the drive claims is * available, and the amount of data the HBA says was returned. */ avail_len = MIN(avail_len, valid_len); used_len = sizeof(hdr->length); printf("%-6s %4s %8s ", "Opcode", "SA", "CDB len" ); if (td_req != 0) printf("%5s %6s %6s ", "CS", "Nom", "Rec"); printf(" Description\n"); while ((avail_len - used_len) > sizeof(*desc)) { struct scsi_report_supported_opcodes_timeout *td; uint32_t td_len; const char *op_desc = NULL; cur_ptr = &buf[used_len]; desc = (struct scsi_report_supported_opcodes_descr *)cur_ptr; op_desc = scsi_op_desc(desc->opcode, &device->inq_data); if (op_desc == NULL) op_desc = "UNKNOWN"; printf("0x%02x %#4x %8u ", desc->opcode, scsi_2btoul(desc->service_action), scsi_2btoul(desc->cdb_length)); used_len += sizeof(*desc); if ((desc->flags & RSO_CTDP) == 0) { printf(" %s\n", op_desc); continue; } /* * If we don't have enough space to fit a timeout * descriptor, then we're done. */ if (avail_len - used_len < sizeof(*td)) { used_len = avail_len; printf(" %s\n", op_desc); continue; } cur_ptr = &buf[used_len]; td = (struct scsi_report_supported_opcodes_timeout *)cur_ptr; td_len = scsi_2btoul(td->length); td_len += sizeof(td->length); used_len += td_len; /* * If the given timeout descriptor length is less than what * we understand, skip it. */ if (td_len < sizeof(*td)) { printf(" %s\n", op_desc); continue; } printf(" 0x%02x %6u %6u %s\n", td->cmd_specific, scsi_4btoul(td->nominal_time), scsi_4btoul(td->recommended_time), op_desc); } bailout: return (retval); } static int scsiopcodes(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout, int verbosemode) { int c; uint32_t opcode = 0, service_action = 0; int td_set = 0, opcode_set = 0, sa_set = 0; int show_sa_errors = 1; uint32_t valid_len = 0; uint8_t *buf = NULL; char *endptr; int retval = 0; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'N': show_sa_errors = 0; break; case 'o': opcode = strtoul(optarg, &endptr, 0); if (*endptr != '\0') { warnx("Invalid opcode \"%s\", must be a number", optarg); retval = 1; goto bailout; } if (opcode > 0xff) { warnx("Invalid opcode 0x%#x, must be between" "0 and 0xff inclusive", opcode); retval = 1; goto bailout; } opcode_set = 1; break; case 's': service_action = strtoul(optarg, &endptr, 0); if (*endptr != '\0') { warnx("Invalid service action \"%s\", must " "be a number", optarg); retval = 1; goto bailout; } if (service_action > 0xffff) { warnx("Invalid service action 0x%#x, must " "be between 0 and 0xffff inclusive", service_action); retval = 1; } sa_set = 1; break; case 'T': td_set = 1; break; default: break; } } if ((sa_set != 0) && (opcode_set == 0)) { warnx("You must specify an opcode with -o if a service " "action is given"); retval = 1; goto bailout; } retval = scsigetopcodes(device, opcode_set, opcode, show_sa_errors, sa_set, service_action, td_set, task_attr, retry_count, timeout, verbosemode, &valid_len, &buf); if (retval != 0) goto bailout; if ((opcode_set != 0) || (sa_set != 0)) { retval = scsiprintoneopcode(device, opcode, sa_set, service_action, buf, valid_len); } else { retval = scsiprintopcodes(device, td_set, buf, valid_len); } bailout: free(buf); return (retval); } static int reprobe(struct cam_device *device) { union ccb *ccb; int retval = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating ccb", __func__); return (1); } CCB_CLEAR_ALL_EXCEPT_HDR(ccb); ccb->ccb_h.func_code = XPT_REPROBE_LUN; if (cam_send_ccb(device, ccb) < 0) { warn("error sending XPT_REPROBE_LUN CCB"); retval = 1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = 1; goto bailout; } bailout: cam_freeccb(ccb); return (retval); } void usage(int printlong) { fprintf(printlong ? stdout : stderr, "usage: camcontrol [device id][generic args][command args]\n" " camcontrol devlist [-b] [-v]\n" " camcontrol periphlist [dev_id][-n dev_name] [-u unit]\n" " camcontrol tur [dev_id][generic args]\n" " camcontrol inquiry [dev_id][generic args] [-D] [-S] [-R]\n" " camcontrol identify [dev_id][generic args] [-v]\n" " camcontrol reportluns [dev_id][generic args] [-c] [-l] [-r report]\n" " camcontrol readcap [dev_id][generic args] [-b] [-h] [-H] [-N]\n" " [-q] [-s] [-l]\n" " camcontrol start [dev_id][generic args]\n" " camcontrol stop [dev_id][generic args]\n" " camcontrol load [dev_id][generic args]\n" " camcontrol eject [dev_id][generic args]\n" " camcontrol reprobe [dev_id][generic args]\n" " camcontrol rescan \n" " camcontrol reset \n" " camcontrol defects [dev_id][generic args] <-f format> [-P][-G]\n" " [-q][-s][-S offset][-X]\n" " camcontrol modepage [dev_id][generic args] <-m page | -l>\n" " [-P pagectl][-e | -b][-d]\n" " camcontrol cmd [dev_id][generic args]\n" " <-a cmd [args] | -c cmd [args]>\n" " [-d] [-f] [-i len fmt|-o len fmt [args]] [-r fmt]\n" " camcontrol smpcmd [dev_id][generic args]\n" " <-r len fmt [args]> <-R len fmt [args]>\n" " camcontrol smprg [dev_id][generic args][-l]\n" " camcontrol smppc [dev_id][generic args] <-p phy> [-l]\n" " [-o operation][-d name][-m rate][-M rate]\n" " [-T pp_timeout][-a enable|disable]\n" " [-A enable|disable][-s enable|disable]\n" " [-S enable|disable]\n" " camcontrol smpphylist [dev_id][generic args][-l][-q]\n" " camcontrol smpmaninfo [dev_id][generic args][-l]\n" " camcontrol debug [-I][-P][-T][-S][-X][-c]\n" " \n" " camcontrol tags [dev_id][generic args] [-N tags] [-q] [-v]\n" " camcontrol negotiate [dev_id][generic args] [-a][-c]\n" " [-D ][-M mode][-O offset]\n" " [-q][-R syncrate][-v][-T ]\n" " [-U][-W bus_width]\n" " camcontrol format [dev_id][generic args][-q][-r][-w][-y]\n" " camcontrol sanitize [dev_id][generic args]\n" " [-a overwrite|block|crypto|exitfailure]\n" " [-c passes][-I][-P pattern][-q][-U][-r][-w]\n" " [-y]\n" " camcontrol idle [dev_id][generic args][-t time]\n" " camcontrol standby [dev_id][generic args][-t time]\n" " camcontrol sleep [dev_id][generic args]\n" " camcontrol powermode [dev_id][generic args]\n" " camcontrol apm [dev_id][generic args][-l level]\n" " camcontrol aam [dev_id][generic args][-l level]\n" " camcontrol fwdownload [dev_id][generic args] <-f fw_image> [-q]\n" " [-s][-y]\n" " camcontrol security [dev_id][generic args]\n" " <-d pwd | -e pwd | -f | -h pwd | -k pwd>\n" " [-l ] [-q] [-s pwd] [-T timeout]\n" " [-U ] [-y]\n" " camcontrol hpa [dev_id][generic args] [-f] [-l] [-P] [-p pwd]\n" " [-q] [-s max_sectors] [-U pwd] [-y]\n" " camcontrol ama [dev_id][generic args] [-f] [-q] [-s max_sectors]\n" " camcontrol persist [dev_id][generic args] <-i action|-o action>\n" " [-a][-I tid][-k key][-K sa_key][-p][-R rtp]\n" " [-s scope][-S][-T type][-U]\n" " camcontrol attrib [dev_id][generic args] <-r action|-w attr>\n" " [-a attr_num][-c][-e elem][-F form1,form1]\n" " [-p part][-s start][-T type][-V vol]\n" " camcontrol opcodes [dev_id][generic args][-o opcode][-s SA]\n" " [-N][-T]\n" " camcontrol zone [dev_id][generic args]<-c cmd> [-a] [-l LBA]\n" " [-o rep_opts] [-P print_opts]\n" " camcontrol epc [dev_id][generic_args]<-c cmd> [-d] [-D] [-e]\n" " [-H] [-p power_cond] [-P] [-r rst_src] [-s]\n" " [-S power_src] [-T timer]\n" " camcontrol timestamp [dev_id][generic_args] <-r [-f format|-m|-U]>|\n" " <-s <-f format -T time | -U >>\n" " camcontrol devtype [dev_id]\n" " \n" " camcontrol help\n"); if (!printlong) return; fprintf(stdout, "Specify one of the following options:\n" "devlist list all CAM devices\n" "periphlist list all CAM peripheral drivers attached to a device\n" "tur send a test unit ready to the named device\n" "inquiry send a SCSI inquiry command to the named device\n" "identify send a ATA identify command to the named device\n" "reportluns send a SCSI report luns command to the device\n" "readcap send a SCSI read capacity command to the device\n" "start send a Start Unit command to the device\n" "stop send a Stop Unit command to the device\n" "load send a Start Unit command to the device with the load bit set\n" "eject send a Stop Unit command to the device with the eject bit set\n" "reprobe update capacity information of the given device\n" "rescan rescan all buses, the given bus, bus:target:lun or device\n" "reset reset all buses, the given bus, bus:target:lun or device\n" "defects read the defect list of the specified device\n" "modepage display or edit (-e) the given mode page\n" "cmd send the given SCSI command, may need -i or -o as well\n" "smpcmd send the given SMP command, requires -o and -i\n" "smprg send the SMP Report General command\n" "smppc send the SMP PHY Control command, requires -p\n" "smpphylist display phys attached to a SAS expander\n" "smpmaninfo send the SMP Report Manufacturer Info command\n" "debug turn debugging on/off for a bus, target, or lun, or all devices\n" "tags report or set the number of transaction slots for a device\n" "negotiate report or set device negotiation parameters\n" "format send the SCSI FORMAT UNIT command to the named device\n" "sanitize send the SCSI SANITIZE command to the named device\n" "idle send the ATA IDLE command to the named device\n" "standby send the ATA STANDBY command to the named device\n" "sleep send the ATA SLEEP command to the named device\n" "powermode send the ATA CHECK POWER MODE command to the named device\n" "fwdownload program firmware of the named device with the given image\n" "security report or send ATA security commands to the named device\n" "persist send the SCSI PERSISTENT RESERVE IN or OUT commands\n" "attrib send the SCSI READ or WRITE ATTRIBUTE commands\n" "opcodes send the SCSI REPORT SUPPORTED OPCODES command\n" "zone manage Zoned Block (Shingled) devices\n" "epc send ATA Extended Power Conditions commands\n" "timestamp report or set the device's timestamp\n" "devtype report the type of device\n" "help this message\n" "Device Identifiers:\n" "bus:target specify the bus and target, lun defaults to 0\n" "bus:target:lun specify the bus, target and lun\n" "deviceUNIT specify the device name, like \"da4\" or \"cd2\"\n" "Generic arguments:\n" "-v be verbose, print out sense information\n" "-t timeout command timeout in seconds, overrides default timeout\n" "-n dev_name specify device name, e.g. \"da\", \"cd\"\n" "-u unit specify unit number, e.g. \"0\", \"5\"\n" "-E have the kernel attempt to perform SCSI error recovery\n" "-C count specify the SCSI command retry count (needs -E to work)\n" "-Q task_attr specify ordered, simple or head tag type for SCSI cmds\n" "modepage arguments:\n" "-l list all available mode pages\n" "-m page specify the mode page to view or edit\n" "-e edit the specified mode page\n" "-b force view to binary mode\n" "-d disable block descriptors for mode sense\n" "-P pgctl page control field 0-3\n" "defects arguments:\n" "-f format specify defect list format (block, bfi or phys)\n" "-G get the grown defect list\n" "-P get the permanent defect list\n" "inquiry arguments:\n" "-D get the standard inquiry data\n" "-S get the serial number\n" "-R get the transfer rate, etc.\n" "reportluns arguments:\n" "-c only report a count of available LUNs\n" "-l only print out luns, and not a count\n" "-r specify \"default\", \"wellknown\" or \"all\"\n" "readcap arguments\n" "-b only report the blocksize\n" "-h human readable device size, base 2\n" "-H human readable device size, base 10\n" "-N print the number of blocks instead of last block\n" "-q quiet, print numbers only\n" "-s only report the last block/device size\n" "cmd arguments:\n" "-c cdb [args] specify the SCSI CDB\n" "-i len fmt specify input data and input data format\n" "-o len fmt [args] specify output data and output data fmt\n" "smpcmd arguments:\n" "-r len fmt [args] specify the SMP command to be sent\n" "-R len fmt [args] specify SMP response format\n" "smprg arguments:\n" "-l specify the long response format\n" "smppc arguments:\n" "-p phy specify the PHY to operate on\n" "-l specify the long request/response format\n" "-o operation specify the phy control operation\n" "-d name set the attached device name\n" "-m rate set the minimum physical link rate\n" "-M rate set the maximum physical link rate\n" "-T pp_timeout set the partial pathway timeout value\n" "-a enable|disable enable or disable SATA slumber\n" "-A enable|disable enable or disable SATA partial phy power\n" "-s enable|disable enable or disable SAS slumber\n" "-S enable|disable enable or disable SAS partial phy power\n" "smpphylist arguments:\n" "-l specify the long response format\n" "-q only print phys with attached devices\n" "smpmaninfo arguments:\n" "-l specify the long response format\n" "debug arguments:\n" "-I CAM_DEBUG_INFO -- scsi commands, errors, data\n" "-T CAM_DEBUG_TRACE -- routine flow tracking\n" "-S CAM_DEBUG_SUBTRACE -- internal routine command flow\n" "-c CAM_DEBUG_CDB -- print out SCSI CDBs only\n" "tags arguments:\n" "-N tags specify the number of tags to use for this device\n" "-q be quiet, don't report the number of tags\n" "-v report a number of tag-related parameters\n" "negotiate arguments:\n" "-a send a test unit ready after negotiation\n" "-c report/set current negotiation settings\n" "-D \"enable\" or \"disable\" disconnection\n" "-M mode set ATA mode\n" "-O offset set command delay offset\n" "-q be quiet, don't report anything\n" "-R syncrate synchronization rate in MHz\n" "-T \"enable\" or \"disable\" tagged queueing\n" "-U report/set user negotiation settings\n" "-W bus_width set the bus width in bits (8, 16 or 32)\n" "-v also print a Path Inquiry CCB for the controller\n" "format arguments:\n" "-q be quiet, don't print status messages\n" "-r run in report only mode\n" "-w don't send immediate format command\n" "-y don't ask any questions\n" "sanitize arguments:\n" "-a operation operation mode: overwrite, block, crypto or exitfailure\n" "-c passes overwrite passes to perform (1 to 31)\n" "-I invert overwrite pattern after each pass\n" "-P pattern path to overwrite pattern file\n" "-q be quiet, don't print status messages\n" "-r run in report only mode\n" "-U run operation in unrestricted completion exit mode\n" "-w don't send immediate sanitize command\n" "-y don't ask any questions\n" "idle/standby arguments:\n" "-t number of seconds before respective state.\n" "fwdownload arguments:\n" "-f fw_image path to firmware image file\n" "-q don't print informational messages, only errors\n" "-s run in simulation mode\n" "-v print info for every firmware segment sent to device\n" "-y don't ask any questions\n" "security arguments:\n" "-d pwd disable security using the given password for the selected\n" " user\n" "-e pwd erase the device using the given pwd for the selected user\n" "-f freeze the security configuration of the specified device\n" "-h pwd enhanced erase the device using the given pwd for the\n" " selected user\n" "-k pwd unlock the device using the given pwd for the selected\n" " user\n" "-l specifies which security level to set: high or maximum\n" "-q be quiet, do not print any status messages\n" "-s pwd password the device (enable security) using the given\n" " pwd for the selected user\n" "-T timeout overrides the timeout (seconds) used for erase operation\n" "-U specifies which user to set: user or master\n" "-y don't ask any questions\n" "hpa arguments:\n" "-f freeze the HPA configuration of the device\n" "-l lock the HPA configuration of the device\n" "-P make the HPA max sectors persist\n" "-p pwd Set the HPA configuration password required for unlock\n" " calls\n" "-q be quiet, do not print any status messages\n" "-s sectors configures the maximum user accessible sectors of the\n" " device\n" "-U pwd unlock the HPA configuration of the device\n" "-y don't ask any questions\n" "ama arguments:\n" "-f freeze the AMA configuration of the device\n" "-q be quiet, do not print any status messages\n" "-s sectors configures the maximum user accessible sectors of the\n" " device\n" "persist arguments:\n" "-i action specify read_keys, read_reservation, report_cap, or\n" " read_full_status\n" "-o action specify register, register_ignore, reserve, release,\n" " clear, preempt, preempt_abort, register_move, replace_lost\n" "-a set the All Target Ports (ALL_TG_PT) bit\n" "-I tid specify a Transport ID, e.g.: sas,0x1234567812345678\n" "-k key specify the Reservation Key\n" "-K sa_key specify the Service Action Reservation Key\n" "-p set the Activate Persist Through Power Loss bit\n" "-R rtp specify the Relative Target Port\n" "-s scope specify the scope: lun, extent, element or a number\n" "-S specify Transport ID for register, requires -I\n" "-T res_type specify the reservation type: read_shared, wr_ex, rd_ex,\n" " ex_ac, wr_ex_ro, ex_ac_ro, wr_ex_ar, ex_ac_ar\n" "-U unregister the current initiator for register_move\n" "attrib arguments:\n" "-r action specify attr_values, attr_list, lv_list, part_list, or\n" " supp_attr\n" "-w attr specify an attribute to write, one -w argument per attr\n" "-a attr_num only display this attribute number\n" "-c get cached attributes\n" "-e elem_addr request attributes for the given element in a changer\n" "-F form1,form2 output format, comma separated list: text_esc, text_raw,\n" " nonascii_esc, nonascii_trim, nonascii_raw, field_all,\n" " field_none, field_desc, field_num, field_size, field_rw\n" "-p partition request attributes for the given partition\n" "-s start_attr request attributes starting at the given number\n" "-T elem_type specify the element type (used with -e)\n" "-V logical_vol specify the logical volume ID\n" "opcodes arguments:\n" "-o opcode specify the individual opcode to list\n" "-s service_action specify the service action for the opcode\n" "-N do not return SCSI error for unsupported SA\n" "-T request nominal and recommended timeout values\n" "zone arguments:\n" "-c cmd required: rz, open, close, finish, or rwp\n" "-a apply the action to all zones\n" "-l LBA specify the zone starting LBA\n" "-o rep_opts report zones options: all, empty, imp_open, exp_open,\n" " closed, full, ro, offline, reset, nonseq, nonwp\n" "-P print_opt report zones printing: normal, summary, script\n" "epc arguments:\n" "-c cmd required: restore, goto, timer, state, enable, disable,\n" " source, status, list\n" "-d disable power mode (timer, state)\n" "-D delayed entry (goto)\n" "-e enable power mode (timer, state)\n" "-H hold power mode (goto)\n" "-p power_cond Idle_a, Idle_b, Idle_c, Standby_y, Standby_z (timer,\n" " state, goto)\n" "-P only display power mode (status)\n" "-r rst_src restore settings from: default, saved (restore)\n" "-s save mode (timer, state, restore)\n" "-S power_src set power source: battery, nonbattery (source)\n" "-T timer set timer, seconds, .1 sec resolution (timer)\n" "timestamp arguments:\n" "-r report the timestamp of the device\n" "-f format report the timestamp of the device with the given\n" " strftime(3) format string\n" "-m report the timestamp of the device as milliseconds since\n" " January 1st, 1970\n" "-U report the time with UTC instead of the local time zone\n" "-s set the timestamp of the device\n" "-f format the format of the time string passed into strptime(3)\n" "-T time the time value passed into strptime(3)\n" "-U set the timestamp of the device to UTC time\n" ); } int main(int argc, char **argv) { int c; char *device = NULL; int unit = 0; struct cam_device *cam_dev = NULL; int timeout = 0, retry_count = 1; camcontrol_optret optreturn; char *tstr; const char *mainopt = "C:En:Q:t:u:v"; const char *subopt = NULL; char combinedopt[256]; int error = 0, optstart = 2; int task_attr = MSG_SIMPLE_Q_TAG; int devopen = 1; path_id_t bus; target_id_t target; lun_id_t lun; cmdlist = CAM_CMD_NONE; arglist = CAM_ARG_NONE; if (argc < 2) { usage(0); exit(1); } /* * Get the base option. */ optreturn = getoption(option_table,argv[1], &cmdlist, &arglist,&subopt); if (optreturn == CC_OR_AMBIGUOUS) { warnx("ambiguous option %s", argv[1]); usage(0); exit(1); } else if (optreturn == CC_OR_NOT_FOUND) { warnx("option %s not found", argv[1]); usage(0); exit(1); } /* * Ahh, getopt(3) is a pain. * * This is a gross hack. There really aren't many other good * options (excuse the pun) for parsing options in a situation like * this. getopt is kinda braindead, so you end up having to run * through the options twice, and give each invocation of getopt * the option string for the other invocation. * * You would think that you could just have two groups of options. * The first group would get parsed by the first invocation of * getopt, and the second group would get parsed by the second * invocation of getopt. It doesn't quite work out that way. When * the first invocation of getopt finishes, it leaves optind pointing * to the argument _after_ the first argument in the second group. * So when the second invocation of getopt comes around, it doesn't * recognize the first argument it gets and then bails out. * * A nice alternative would be to have a flag for getopt that says * "just keep parsing arguments even when you encounter an unknown * argument", but there isn't one. So there's no real clean way to * easily parse two sets of arguments without having one invocation * of getopt know about the other. * * Without this hack, the first invocation of getopt would work as * long as the generic arguments are first, but the second invocation * (in the subfunction) would fail in one of two ways. In the case * where you don't set optreset, it would fail because optind may be * pointing to the argument after the one it should be pointing at. * In the case where you do set optreset, and reset optind, it would * fail because getopt would run into the first set of options, which * it doesn't understand. * * All of this would "sort of" work if you could somehow figure out * whether optind had been incremented one option too far. The * mechanics of that, however, are more daunting than just giving * both invocations all of the expect options for either invocation. * * Needless to say, I wouldn't mind if someone invented a better * (non-GPL!) command line parsing interface than getopt. I * wouldn't mind if someone added more knobs to getopt to make it * work better. Who knows, I may talk myself into doing it someday, * if the standards weenies let me. As it is, it just leads to * hackery like this and causes people to avoid it in some cases. * * KDM, September 8th, 1998 */ if (subopt != NULL) sprintf(combinedopt, "%s%s", mainopt, subopt); else sprintf(combinedopt, "%s", mainopt); /* * For these options we do not parse optional device arguments and * we do not open a passthrough device. */ if ((cmdlist == CAM_CMD_RESCAN) || (cmdlist == CAM_CMD_RESET) || (cmdlist == CAM_CMD_DEVTREE) || (cmdlist == CAM_CMD_USAGE) || (cmdlist == CAM_CMD_DEBUG)) devopen = 0; if ((devopen == 1) && (argc > 2 && argv[2][0] != '-')) { char name[30]; int rv; if (isdigit(argv[2][0])) { /* device specified as bus:target[:lun] */ rv = parse_btl(argv[2], &bus, &target, &lun, &arglist); if (rv < 2) errx(1, "numeric device specification must " "be either bus:target, or " "bus:target:lun"); /* default to 0 if lun was not specified */ if ((arglist & CAM_ARG_LUN) == 0) { lun = 0; arglist |= CAM_ARG_LUN; } optstart++; } else { if (cam_get_device(argv[2], name, sizeof name, &unit) == -1) errx(1, "%s", cam_errbuf); device = strdup(name); arglist |= CAM_ARG_DEVICE | CAM_ARG_UNIT; optstart++; } } /* * Start getopt processing at argv[2/3], since we've already * accepted argv[1..2] as the command name, and as a possible * device name. */ optind = optstart; /* * Now we run through the argument list looking for generic * options, and ignoring options that possibly belong to * subfunctions. */ while ((c = getopt(argc, argv, combinedopt))!= -1){ switch(c) { case 'C': retry_count = strtol(optarg, NULL, 0); if (retry_count < 0) errx(1, "retry count %d is < 0", retry_count); arglist |= CAM_ARG_RETRIES; break; case 'E': arglist |= CAM_ARG_ERR_RECOVER; break; case 'n': arglist |= CAM_ARG_DEVICE; tstr = optarg; while (isspace(*tstr) && (*tstr != '\0')) tstr++; device = (char *)strdup(tstr); break; case 'Q': { char *endptr; int table_entry = 0; tstr = optarg; while (isspace(*tstr) && (*tstr != '\0')) tstr++; if (isdigit(*tstr)) { task_attr = strtol(tstr, &endptr, 0); if (*endptr != '\0') { errx(1, "Invalid queue option " "%s", tstr); } } else { size_t table_size; scsi_nv_status status; table_size = sizeof(task_attrs) / sizeof(task_attrs[0]); status = scsi_get_nv(task_attrs, table_size, tstr, &table_entry, SCSI_NV_FLAG_IG_CASE); if (status == SCSI_NV_FOUND) task_attr = task_attrs[ table_entry].value; else { errx(1, "%s option %s", (status == SCSI_NV_AMBIGUOUS)? "ambiguous" : "invalid", tstr); } } break; } case 't': timeout = strtol(optarg, NULL, 0); if (timeout < 0) errx(1, "invalid timeout %d", timeout); /* Convert the timeout from seconds to ms */ timeout *= 1000; arglist |= CAM_ARG_TIMEOUT; break; case 'u': arglist |= CAM_ARG_UNIT; unit = strtol(optarg, NULL, 0); break; case 'v': arglist |= CAM_ARG_VERBOSE; break; default: break; } } /* * For most commands we'll want to open the passthrough device * associated with the specified device. In the case of the rescan * commands, we don't use a passthrough device at all, just the * transport layer device. */ if (devopen == 1) { if (((arglist & (CAM_ARG_BUS|CAM_ARG_TARGET)) == 0) && (((arglist & CAM_ARG_DEVICE) == 0) || ((arglist & CAM_ARG_UNIT) == 0))) { errx(1, "subcommand \"%s\" requires a valid device " "identifier", argv[1]); } if ((cam_dev = ((arglist & (CAM_ARG_BUS | CAM_ARG_TARGET))? cam_open_btl(bus, target, lun, O_RDWR, NULL) : cam_open_spec_device(device,unit,O_RDWR,NULL))) == NULL) errx(1,"%s", cam_errbuf); } /* * Reset optind to 2, and reset getopt, so these routines can parse * the arguments again. */ optind = optstart; optreset = 1; switch(cmdlist) { case CAM_CMD_DEVLIST: error = getdevlist(cam_dev); break; case CAM_CMD_HPA: error = atahpa(cam_dev, retry_count, timeout, argc, argv, combinedopt); break; case CAM_CMD_AMA: error = ataama(cam_dev, retry_count, timeout, argc, argv, combinedopt); break; case CAM_CMD_DEVTREE: error = getdevtree(argc, argv, combinedopt); break; case CAM_CMD_DEVTYPE: error = getdevtype(cam_dev); break; case CAM_CMD_TUR: error = testunitready(cam_dev, task_attr, retry_count, timeout, 0); break; case CAM_CMD_INQUIRY: error = scsidoinquiry(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_IDENTIFY: error = identify(cam_dev, retry_count, timeout); break; case CAM_CMD_STARTSTOP: error = scsistart(cam_dev, arglist & CAM_ARG_START_UNIT, arglist & CAM_ARG_EJECT, task_attr, retry_count, timeout); break; case CAM_CMD_RESCAN: error = dorescan_or_reset(argc, argv, 1); break; case CAM_CMD_RESET: error = dorescan_or_reset(argc, argv, 0); break; case CAM_CMD_READ_DEFECTS: error = readdefects(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_MODE_PAGE: modepage(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_SCSI_CMD: error = scsicmd(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_MMCSD_CMD: error = mmcsdcmd(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_SMP_CMD: error = smpcmd(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_SMP_RG: error = smpreportgeneral(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_SMP_PC: error = smpphycontrol(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_SMP_PHYLIST: error = smpphylist(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_SMP_MANINFO: error = smpmaninfo(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_DEBUG: error = camdebug(argc, argv, combinedopt); break; case CAM_CMD_TAG: error = tagcontrol(cam_dev, argc, argv, combinedopt); break; case CAM_CMD_RATE: error = ratecontrol(cam_dev, task_attr, retry_count, timeout, argc, argv, combinedopt); break; case CAM_CMD_FORMAT: error = scsiformat(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_REPORTLUNS: error = scsireportluns(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_READCAP: error = scsireadcapacity(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_IDLE: case CAM_CMD_STANDBY: case CAM_CMD_SLEEP: case CAM_CMD_POWER_MODE: error = atapm(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_APM: case CAM_CMD_AAM: error = ataaxm(cam_dev, argc, argv, combinedopt, retry_count, timeout); break; case CAM_CMD_SECURITY: error = atasecurity(cam_dev, retry_count, timeout, argc, argv, combinedopt); break; case CAM_CMD_DOWNLOAD_FW: error = fwdownload(cam_dev, argc, argv, combinedopt, arglist & CAM_ARG_VERBOSE, task_attr, retry_count, timeout); break; case CAM_CMD_SANITIZE: error = sanitize(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout); break; case CAM_CMD_PERSIST: error = scsipersist(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout, arglist & CAM_ARG_VERBOSE, arglist & CAM_ARG_ERR_RECOVER); break; case CAM_CMD_ATTRIB: error = scsiattrib(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout, arglist & CAM_ARG_VERBOSE, arglist & CAM_ARG_ERR_RECOVER); break; case CAM_CMD_OPCODES: error = scsiopcodes(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout, arglist & CAM_ARG_VERBOSE); break; case CAM_CMD_REPROBE: error = reprobe(cam_dev); break; case CAM_CMD_ZONE: error = zone(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout, arglist & CAM_ARG_VERBOSE); break; case CAM_CMD_EPC: error = epc(cam_dev, argc, argv, combinedopt, retry_count, timeout, arglist & CAM_ARG_VERBOSE); break; case CAM_CMD_TIMESTAMP: error = timestamp(cam_dev, argc, argv, combinedopt, task_attr, retry_count, timeout, arglist & CAM_ARG_VERBOSE); break; case CAM_CMD_USAGE: usage(1); break; default: usage(0); error = 1; break; } if (cam_dev != NULL) cam_close_device(cam_dev); exit(error); } Index: projects/fuse2/sbin/camcontrol/timestamp.c =================================================================== --- projects/fuse2/sbin/camcontrol/timestamp.c (revision 350434) +++ projects/fuse2/sbin/camcontrol/timestamp.c (revision 350435) @@ -1,508 +1,501 @@ /*- * Copyright (c) 2016 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Ken Merry (Spectra Logic Corporation) * Reid Linnemann (Spectra Logic Corporation) * Samuel Klopsch (Spectra Logic Corporation) */ /* * SCSI tape drive timestamp support */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "camcontrol.h" #define TIMESTAMP_REPORT 0 #define TIMESTAMP_SET 1 #define MIL "milliseconds" #define UTC "utc" static int set_restore_flags(struct cam_device *device, uint8_t *flags, int set_flag, int task_attr, int retry_count, int timeout); static int report_timestamp(struct cam_device *device, uint64_t *ts, int task_attr, int retry_count, int timeout); static int set_timestamp(struct cam_device *device, char *format_string, char *timestamp_string, int task_attr, int retry_count, int timeout); static int set_restore_flags(struct cam_device *device, uint8_t *flags, int set_flag, int task_attr, int retry_count, int timeout) { unsigned long blk_desc_length, hdr_and_blk_length; int error = 0; struct scsi_control_ext_page *control_page = NULL; struct scsi_mode_header_10 *mode_hdr = NULL; - struct scsi_mode_sense_10 *cdb = NULL; union ccb *ccb = NULL; unsigned long mode_buf_size = sizeof(struct scsi_mode_header_10) + sizeof(struct scsi_mode_blk_desc) + sizeof(struct scsi_control_ext_page); uint8_t mode_buf[mode_buf_size]; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); error = 1; goto bailout; } /* * Get the control extension subpage, we'll send it back modified to * enable SCSI control over the tape drive's timestamp */ - scsi_mode_sense_len(&ccb->csio, + scsi_mode_sense_subpage(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*dbd*/ 0, /*page_control*/ SMS_PAGE_CTRL_CURRENT, /*page*/ SCEP_PAGE_CODE, + /*subpage*/ SCEP_SUBPAGE_CODE, /*param_buf*/ &mode_buf[0], /*param_len*/ mode_buf_size, /*minimum_cmd_size*/ 10, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); - /* - * scsi_mode_sense_len does not have a subpage argument at the moment, - * so we have to manually set the subpage code before calling - * cam_send_ccb(). - */ - cdb = (struct scsi_mode_sense_10 *)ccb->csio.cdb_io.cdb_bytes; - cdb->subpage = SCEP_SUBPAGE_CODE; ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (retry_count > 0) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; error = cam_send_ccb(device, ccb); if (error != 0) { warn("error sending Mode Sense"); goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); error = 1; goto bailout; } mode_hdr = (struct scsi_mode_header_10 *)&mode_buf[0]; blk_desc_length = scsi_2btoul(mode_hdr->blk_desc_len); hdr_and_blk_length = sizeof(struct scsi_mode_header_10)+blk_desc_length; /* * Create the control page at the correct point in the mode_buf, it * starts after the header and the blk description. */ assert(hdr_and_blk_length <= sizeof(mode_buf) - sizeof(struct scsi_control_ext_page)); control_page = (struct scsi_control_ext_page *)&mode_buf [hdr_and_blk_length]; if (set_flag != 0) { *flags = control_page->flags; /* * Set the SCSIP flag to enable SCSI to change the * tape drive's timestamp. */ control_page->flags |= SCEP_SCSIP; } else { control_page->flags = *flags; } scsi_mode_select_len(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*scsi_page_fmt*/ 1, /*save_pages*/ 0, /*param_buf*/ &mode_buf[0], /*param_len*/ mode_buf_size, /*minimum_cmd_size*/ 10, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (retry_count > 0) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; error = cam_send_ccb(device, ccb); if (error != 0) { warn("error sending Mode Select"); goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); error = 1; goto bailout; } bailout: if (ccb != NULL) cam_freeccb(ccb); return error; } static int report_timestamp(struct cam_device *device, uint64_t *ts, int task_attr, int retry_count, int timeout) { int error = 0; struct scsi_report_timestamp_data *report_buf = malloc( sizeof(struct scsi_report_timestamp_data)); uint8_t temp_timestamp[8]; uint32_t report_buf_size = sizeof( struct scsi_report_timestamp_data); union ccb *ccb = NULL; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); error = 1; goto bailout; } scsi_report_timestamp(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*pdf*/ 0, /*buf*/ report_buf, /*buf_len*/ report_buf_size, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (retry_count > 0) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; error = cam_send_ccb(device, ccb); if (error != 0) { warn("error sending Report Timestamp"); goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); error = 1; goto bailout; } bzero(temp_timestamp, sizeof(temp_timestamp)); memcpy(&temp_timestamp[2], &report_buf->timestamp, 6); *ts = scsi_8btou64(temp_timestamp); bailout: if (ccb != NULL) cam_freeccb(ccb); free(report_buf); return error; } static int set_timestamp(struct cam_device *device, char *format_string, char *timestamp_string, int task_attr, int retry_count, int timeout) { struct scsi_set_timestamp_parameters ts_p; time_t time_value; struct tm time_struct; uint8_t flags = 0; int error = 0; uint64_t ts = 0; union ccb *ccb = NULL; int do_restore_flags = 0; error = set_restore_flags(device, &flags, /*set_flag*/ 1, task_attr, retry_count, timeout); if (error != 0) goto bailout; do_restore_flags = 1; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: error allocating CCB", __func__); error = 1; goto bailout; } if (strcmp(format_string, UTC) == 0) { time(&time_value); ts = (uint64_t) time_value; } else { bzero(&time_struct, sizeof(struct tm)); if (strptime(timestamp_string, format_string, &time_struct) == NULL) { warnx("%s: strptime(3) failed", __func__); error = 1; goto bailout; } time_value = mktime(&time_struct); ts = (uint64_t) time_value; } /* Convert time from seconds to milliseconds */ ts *= 1000; bzero(&ts_p, sizeof(ts_p)); scsi_create_timestamp(ts_p.timestamp, ts); scsi_set_timestamp(&ccb->csio, /*retries*/ retry_count, /*cbfcnp*/ NULL, /*tag_action*/ task_attr, /*buf*/ &ts_p, /*buf_len*/ sizeof(ts_p), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout ? timeout : 5000); ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (retry_count > 0) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; error = cam_send_ccb(device, ccb); if (error != 0) { warn("error sending Set Timestamp"); goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); error = 1; goto bailout; } printf("Timestamp set to %ju\n", (uintmax_t)ts); bailout: if (do_restore_flags != 0) error = set_restore_flags(device, &flags, /*set_flag*/ 0, task_attr, retry_count, timeout); if (ccb != NULL) cam_freeccb(ccb); return error; } int timestamp(struct cam_device *device, int argc, char **argv, char *combinedopt, int task_attr, int retry_count, int timeout, int verbosemode __unused) { int c; uint64_t ts = 0; char *format_string = NULL; char *timestamp_string = NULL; int action = -1; int error = 0; int single_arg = 0; int do_utc = 0; while ((c = getopt(argc, argv, combinedopt)) != -1) { switch (c) { case 'r': { if (action != -1) { warnx("Use only one -r or only one -s"); error =1; goto bailout; } action = TIMESTAMP_REPORT; break; } case 's': { if (action != -1) { warnx("Use only one -r or only one -s"); error = 1; goto bailout; } action = TIMESTAMP_SET; break; } case 'f': { single_arg++; free(format_string); format_string = strdup(optarg); if (format_string == NULL) { warn("Error allocating memory for format " "argument"); error = 1; goto bailout; } break; } case 'm': { single_arg++; free(format_string); format_string = strdup(MIL); if (format_string == NULL) { warn("Error allocating memory"); error = 1; goto bailout; } break; } case 'U': { do_utc = 1; break; } case 'T': free(timestamp_string); timestamp_string = strdup(optarg); if (timestamp_string == NULL) { warn("Error allocating memory for format " "argument"); error = 1; goto bailout; } break; default: break; } } if (action == -1) { warnx("Must specify an action, either -r or -s"); error = 1; goto bailout; } if (single_arg > 1) { warnx("Select only one: %s", (action == TIMESTAMP_REPORT) ? "-f format or -m for the -r flag" : "-f format -T time or -U for the -s flag"); error = 1; goto bailout; } if (action == TIMESTAMP_SET) { if ((format_string == NULL) && (do_utc == 0)) { warnx("Must specify either -f format or -U for " "setting the timestamp"); error = 1; } else if ((format_string != NULL) && (do_utc != 0)) { warnx("Must specify only one of -f or -U to set " "the timestamp"); error = 1; } else if ((format_string != NULL) && (strcmp(format_string, MIL) == 0)) { warnx("-m is not allowed for setting the " "timestamp"); error = 1; } else if ((do_utc == 0) && (timestamp_string == NULL)) { warnx("Must specify the time (-T) to set as the " "timestamp"); error = 1; } if (error != 0) goto bailout; } else if (action == TIMESTAMP_REPORT) { if (format_string == NULL) { format_string = strdup("%c %Z"); if (format_string == NULL) { warn("Error allocating memory for format " "string"); error = 1; goto bailout; } } } if (action == TIMESTAMP_REPORT) { error = report_timestamp(device, &ts, task_attr, retry_count, timeout); if (error != 0) { goto bailout; } else if (strcmp(format_string, MIL) == 0) { printf("Timestamp in milliseconds: %ju\n", (uintmax_t)ts); } else { char temp_timestamp_string[100]; time_t time_var = ts / 1000; const struct tm *restrict cur_time; setlocale(LC_TIME, ""); if (do_utc != 0) cur_time = gmtime(&time_var); else cur_time = localtime(&time_var); strftime(temp_timestamp_string, sizeof(temp_timestamp_string), format_string, cur_time); printf("Formatted timestamp: %s\n", temp_timestamp_string); } } else if (action == TIMESTAMP_SET) { if (do_utc != 0) { format_string = strdup(UTC); if (format_string == NULL) { warn("Error allocating memory for format " "string"); error = 1; goto bailout; } } error = set_timestamp(device, format_string, timestamp_string, task_attr, retry_count, timeout); } bailout: free(format_string); free(timestamp_string); return (error); } Index: projects/fuse2/sbin/nvmecontrol/identify_ext.c =================================================================== --- projects/fuse2/sbin/nvmecontrol/identify_ext.c (revision 350434) +++ projects/fuse2/sbin/nvmecontrol/identify_ext.c (revision 350435) @@ -1,223 +1,246 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "nvmecontrol.h" #include "nvmecontrol_ext.h" void nvme_print_controller(struct nvme_controller_data *cdata) { uint8_t str[128]; char cbuf[UINT128_DIG + 1]; uint16_t oncs, oacs; - uint8_t compare, write_unc, dsm, vwc_present; + uint8_t compare, write_unc, dsm, t; uint8_t security, fmt, fw, nsmgmt; uint8_t fw_slot1_ro, fw_num_slots; uint8_t ns_smart; uint8_t sqes_max, sqes_min; uint8_t cqes_max, cqes_min; oncs = cdata->oncs; compare = (oncs >> NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT) & NVME_CTRLR_DATA_ONCS_COMPARE_MASK; write_unc = (oncs >> NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT) & NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK; dsm = (oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) & NVME_CTRLR_DATA_ONCS_DSM_MASK; - vwc_present = (cdata->vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) & - NVME_CTRLR_DATA_VWC_PRESENT_MASK; oacs = cdata->oacs; security = (oacs >> NVME_CTRLR_DATA_OACS_SECURITY_SHIFT) & NVME_CTRLR_DATA_OACS_SECURITY_MASK; fmt = (oacs >> NVME_CTRLR_DATA_OACS_FORMAT_SHIFT) & NVME_CTRLR_DATA_OACS_FORMAT_MASK; fw = (oacs >> NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT) & NVME_CTRLR_DATA_OACS_FIRMWARE_MASK; nsmgmt = (oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) & NVME_CTRLR_DATA_OACS_NSMGMT_MASK; fw_num_slots = (cdata->frmw >> NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT) & NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK; fw_slot1_ro = (cdata->frmw >> NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT) & NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK; ns_smart = (cdata->lpa >> NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT) & NVME_CTRLR_DATA_LPA_NS_SMART_MASK; sqes_min = (cdata->sqes >> NVME_CTRLR_DATA_SQES_MIN_SHIFT) & NVME_CTRLR_DATA_SQES_MIN_MASK; sqes_max = (cdata->sqes >> NVME_CTRLR_DATA_SQES_MAX_SHIFT) & NVME_CTRLR_DATA_SQES_MAX_MASK; cqes_min = (cdata->cqes >> NVME_CTRLR_DATA_CQES_MIN_SHIFT) & NVME_CTRLR_DATA_CQES_MIN_MASK; cqes_max = (cdata->cqes >> NVME_CTRLR_DATA_CQES_MAX_SHIFT) & NVME_CTRLR_DATA_CQES_MAX_MASK; printf("Controller Capabilities/Features\n"); printf("================================\n"); printf("Vendor ID: %04x\n", cdata->vid); printf("Subsystem Vendor ID: %04x\n", cdata->ssvid); nvme_strvis(str, cdata->sn, sizeof(str), NVME_SERIAL_NUMBER_LENGTH); printf("Serial Number: %s\n", str); nvme_strvis(str, cdata->mn, sizeof(str), NVME_MODEL_NUMBER_LENGTH); printf("Model Number: %s\n", str); nvme_strvis(str, cdata->fr, sizeof(str), NVME_FIRMWARE_REVISION_LENGTH); printf("Firmware Version: %s\n", str); printf("Recommended Arb Burst: %d\n", cdata->rab); printf("IEEE OUI Identifier: %02x %02x %02x\n", cdata->ieee[0], cdata->ieee[1], cdata->ieee[2]); - printf("Multi-Path I/O Capabilities: %s%s%s%s\n", + printf("Multi-Path I/O Capabilities: %s%s%s%s%s\n", (cdata->mic == 0) ? "Not Supported" : "", + ((cdata->mic >> NVME_CTRLR_DATA_MIC_ANAR_SHIFT) & + NVME_CTRLR_DATA_MIC_SRIOVVF_MASK) ? "Asymmetric, " : "", ((cdata->mic >> NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT) & NVME_CTRLR_DATA_MIC_SRIOVVF_MASK) ? "SR-IOV VF, " : "", ((cdata->mic >> NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT) & NVME_CTRLR_DATA_MIC_MCTRLRS_MASK) ? "Multiple controllers, " : "", ((cdata->mic >> NVME_CTRLR_DATA_MIC_MPORTS_SHIFT) & NVME_CTRLR_DATA_MIC_MPORTS_MASK) ? "Multiple ports" : ""); /* TODO: Use CAP.MPSMIN to determine true memory page size. */ printf("Max Data Transfer Size: "); if (cdata->mdts == 0) printf("Unlimited\n"); else printf("%ld\n", PAGE_SIZE * (1L << cdata->mdts)); printf("Controller ID: 0x%02x\n", cdata->ctrlr_id); printf("Version: %d.%d.%d\n", (cdata->ver >> 16) & 0xffff, (cdata->ver >> 8) & 0xff, cdata->ver & 0xff); printf("\n"); printf("Admin Command Set Attributes\n"); printf("============================\n"); printf("Security Send/Receive: %s\n", security ? "Supported" : "Not Supported"); printf("Format NVM: %s\n", fmt ? "Supported" : "Not Supported"); printf("Firmware Activate/Download: %s\n", fw ? "Supported" : "Not Supported"); printf("Namespace Managment: %s\n", nsmgmt ? "Supported" : "Not Supported"); printf("Device Self-test: %sSupported\n", ((oacs >> NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT) & NVME_CTRLR_DATA_OACS_SELFTEST_MASK) ? "" : "Not "); printf("Directives: %sSupported\n", ((oacs >> NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT) & NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK) ? "" : "Not "); printf("NVMe-MI Send/Receive: %sSupported\n", ((oacs >> NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT) & NVME_CTRLR_DATA_OACS_NVMEMI_MASK) ? "" : "Not "); printf("Virtualization Management: %sSupported\n", ((oacs >> NVME_CTRLR_DATA_OACS_VM_SHIFT) & NVME_CTRLR_DATA_OACS_VM_MASK) ? "" : "Not "); - printf("Doorbell Buffer Config %sSupported\n", + printf("Doorbell Buffer Config: %sSupported\n", ((oacs >> NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT) & NVME_CTRLR_DATA_OACS_DBBUFFER_MASK) ? "" : "Not "); + printf("Get LBA Status: %sSupported\n", + ((oacs >> NVME_CTRLR_DATA_OACS_GETLBA_SHIFT) & + NVME_CTRLR_DATA_OACS_GETLBA_MASK) ? "" : "Not "); + printf("Sanitize: "); + if (cdata->sanicap != 0) { + printf("%s%s%s\n", + ((cdata->sanicap >> NVME_CTRLR_DATA_SANICAP_CES_SHIFT) & + NVME_CTRLR_DATA_SANICAP_CES_SHIFT) ? "crypto, " : "", + ((cdata->sanicap >> NVME_CTRLR_DATA_SANICAP_BES_SHIFT) & + NVME_CTRLR_DATA_SANICAP_BES_SHIFT) ? "block, " : "", + ((cdata->sanicap >> NVME_CTRLR_DATA_SANICAP_OWS_SHIFT) & + NVME_CTRLR_DATA_SANICAP_OWS_SHIFT) ? "overwrite" : ""); + } else { + printf("Not Supported\n"); + } printf("Abort Command Limit: %d\n", cdata->acl+1); printf("Async Event Request Limit: %d\n", cdata->aerl+1); printf("Number of Firmware Slots: "); if (fw != 0) printf("%d\n", fw_num_slots); else printf("N/A\n"); printf("Firmware Slot 1 Read-Only: "); if (fw != 0) printf("%s\n", fw_slot1_ro ? "Yes" : "No"); else printf("N/A\n"); printf("Per-Namespace SMART Log: %s\n", ns_smart ? "Yes" : "No"); printf("Error Log Page Entries: %d\n", cdata->elpe+1); printf("Number of Power States: %d\n", cdata->npss+1); printf("\n"); printf("NVM Command Set Attributes\n"); printf("==========================\n"); printf("Submission Queue Entry Size\n"); printf(" Max: %d\n", 1 << sqes_max); printf(" Min: %d\n", 1 << sqes_min); printf("Completion Queue Entry Size\n"); printf(" Max: %d\n", 1 << cqes_max); printf(" Min: %d\n", 1 << cqes_min); printf("Number of Namespaces: %d\n", cdata->nn); printf("Compare Command: %s\n", compare ? "Supported" : "Not Supported"); printf("Write Uncorrectable Command: %s\n", write_unc ? "Supported" : "Not Supported"); printf("Dataset Management Command: %s\n", dsm ? "Supported" : "Not Supported"); printf("Write Zeroes Command: %sSupported\n", ((oncs >> NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT) & NVME_CTRLR_DATA_ONCS_WRZERO_MASK) ? "" : "Not "); printf("Save Features: %sSupported\n", ((oncs >> NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT) & NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK) ? "" : "Not "); printf("Reservations: %sSupported\n", ((oncs >> NVME_CTRLR_DATA_ONCS_RESERV_SHIFT) & NVME_CTRLR_DATA_ONCS_RESERV_MASK) ? "" : "Not "); printf("Timestamp feature: %sSupported\n", ((oncs >> NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT) & NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK) ? "" : "Not "); + printf("Verify feature: %sSupported\n", + ((oncs >> NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT) & + NVME_CTRLR_DATA_ONCS_VERIFY_MASK) ? "" : "Not "); printf("Fused Operation Support: %s%s\n", (cdata->fuses == 0) ? "Not Supported" : "", ((cdata->fuses >> NVME_CTRLR_DATA_FUSES_CNW_SHIFT) & NVME_CTRLR_DATA_FUSES_CNW_MASK) ? "Compare and Write" : ""); printf("Format NVM Attributes: %s%s Erase, %s Format\n", ((cdata->fna >> NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT) & NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK) ? "Crypto Erase, " : "", ((cdata->fna >> NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT) & NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK) ? "All-NVM" : "Per-NS", ((cdata->fna >> NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT) & NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK) ? "All-NVM" : "Per-NS"); - printf("Volatile Write Cache: %s\n", - vwc_present ? "Present" : "Not Present"); + t = (cdata->vwc >> NVME_CTRLR_DATA_VWC_ALL_SHIFT) & + NVME_CTRLR_DATA_VWC_ALL_MASK; + printf("Volatile Write Cache: %s%s\n", + ((cdata->vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) & + NVME_CTRLR_DATA_VWC_PRESENT_MASK) ? "Present" : "Not Present", + (t == NVME_CTRLR_DATA_VWC_ALL_NO) ? ", no flush all" : + (t == NVME_CTRLR_DATA_VWC_ALL_YES) ? ", flush all" : ""); if (nsmgmt) { printf("\n"); printf("Namespace Drive Attributes\n"); printf("==========================\n"); printf("NVM total cap: %s\n", uint128_to_str(to128(cdata->untncap.tnvmcap), cbuf, sizeof(cbuf))); printf("NVM unallocated cap: %s\n", uint128_to_str(to128(cdata->untncap.unvmcap), cbuf, sizeof(cbuf))); } } Index: projects/fuse2/share/man/man4/cc_dctcp.4 =================================================================== --- projects/fuse2/share/man/man4/cc_dctcp.4 (revision 350434) +++ projects/fuse2/share/man/man4/cc_dctcp.4 (revision 350435) @@ -1,133 +1,146 @@ .\" .\" Copyright (c) 2014 Midori Kato .\" Copyright (c) 2014 The FreeBSD Foundation .\" All rights reserved. .\" .\" Portions of this documentation were written at Keio University, Japan. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR .\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd January 12, 2015 +.Dd July 29, 2019 .Dt CC_DCTCP 4 .Os .Sh NAME .Nm cc_dctcp .Nd DCTCP Congestion Control Algorithm .Sh DESCRIPTION The DCTCP (data center TCP) congestion control algorithm aims to maximise throughput and minimise latency in data center networks by utilising the proportion of Explicit Congestion Notification (ECN) marks received from capable hardware as a congestion signal. .Pp DCTCP uses fraction of ECN marked packets to update congestion window. The window reduction ratio is always <= 1/2. Only when all of the packets are marked, congestion window is halved. .Pp In order to keep the accuracy of the ECN marked fraction, a DCTCP receiver mirrors back incoming (or missing) CE marks by setting (or clearing) ECE marks. This feedback methodology is also adopted when the receiver uses delayed ACK. .Pp The .Fx DCTCP implementation includes two minor modifications for the one-sided deployment. Considering the situation that DCTCP is used as sender and classic ECN is used as receiver, DCTCP sets the CWR flag as the reaction to the ECE flag. In addition, when classic ECN is used as sender and DCTCP is used as receiver, DCTCP avoids to mirror back ACKs only when the CWR flag is set in the incoming packet. .Pp -The other specifications are based on the paper and Internet Draft referenced +The other specifications are based on the paper and the RFC referenced in the .Sx SEE ALSO section below. .Sh MIB Variables The algorithm exposes the following tunable variables in the .Va net.inet.tcp.cc.dctcp branch of the .Xr sysctl 3 MIB: -.Bl -tag -width ".Va alpha" +.Bl -tag -width ".Va slowstart" .It Va alpha -An initial estimator of the congestion on the link. -Default is 0. -.It Va dctcp_shift_g -An estimation gain in the alpha calculation. -Default is 16. +The initial value to estimate the congestion on the link. +The valid range is from 0 to 1024, where 1024 reduces the congestion +window to half, if a CE is observed in the first window and +.Va alpha +could not yet adjust to the congestion level on that path. +Default is 1024. +.It Va shift_g +An estimation gain in the +.Va alpha +calculation. +This influences the responsiveness when adjusting alpha +to the most recent observed window. +Valid range from 0 to 10, the default is 4, resulting in an effective +gain of 1 / ( 2 ^ +.Va shift_g +), or 1/16th. .It Va slowstart -A trigger to halve congestion window after slow start. -Default does nothing to halve window. +A flag if the congestion window should be reduced by one half after slow start. +Valid settings 0 and 1, default 0. .El .Sh SEE ALSO .Xr cc_chd 4 , .Xr cc_cubic 4 , .Xr cc_hd 4 , .Xr cc_htcp 4 , .Xr cc_newreno 4 , .Xr cc_vegas 4 , .Xr mod_cc 4 , .Xr tcp 4 , .Xr mod_cc 9 .Rs .%A "Mohammad Alizadeh" .%A "Albert Greenberg" .%A "David A. Maltz" .%A "Jitendra Padhye" .%A "Parveen Patel" .%A "Balaji Prabhakar" .%A "Sudipta Sengupta" .%A "Murari Sridharan" .%T "Data Center TCP (DCTCP)" .%U "http://research.microsoft.com/pubs/121386/dctcp-public.pdf" .%J "ACM SIGCOMM 2010" .%D "July 2010" .%P "63-74" .Re .Rs .%A "Stephen Bensley" -.%A "Lars Eggert" .%A "Dave Thaler" -.%T "Microsoft's Datacenter TCP (DCTCP): TCP Congestion Control for Datacenters" -.%U "http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-01" +.%A "Praveen Balasubramanian" +.%A "Lars Eggert" +.%A "Glenn Judd" +.%T "Data Center TCP (DCTCP): TCP Congestion Control for Data Centers" +.%U "https://tools.ietf.org/html/rfc8257" .Re .Sh HISTORY The .Nm congestion control module first appeared in .Fx 11.0 . .Pp The module was first released in 2014 by Midori Kato studying at Keio University, Japan. .Sh AUTHORS .An -nosplit The .Nm congestion control module and this manual page were written by .An Midori Kato Mt katoon@sfc.wide.ad.jp and .An Lars Eggert Mt lars@netapp.com with help and modifications from .An Hiren Panchasara Mt hiren@FreeBSD.org Index: projects/fuse2/sys/arm/ti/cpsw/if_cpsw.c =================================================================== --- projects/fuse2/sys/arm/ti/cpsw/if_cpsw.c (revision 350434) +++ projects/fuse2/sys/arm/ti/cpsw/if_cpsw.c (revision 350435) @@ -1,2995 +1,2997 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Damjan Marion * Copyright (c) 2016 Rubicon Communications, LLC (Netgate) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * TI Common Platform Ethernet Switch (CPSW) Driver * Found in TI8148 "DaVinci" and AM335x "Sitara" SoCs. * * This controller is documented in the AM335x Technical Reference * Manual, in the TMS320DM814x DaVinci Digital Video Processors TRM * and in the TMS320C6452 3 Port Switch Ethernet Subsystem TRM. * * It is basically a single Ethernet port (port 0) wired internally to * a 3-port store-and-forward switch connected to two independent * "sliver" controllers (port 1 and port 2). You can operate the * controller in a variety of different ways by suitably configuring * the slivers and the Address Lookup Engine (ALE) that routes packets * between the ports. * * This code was developed and tested on a BeagleBone with * an AM335x SoC. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpsw.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CPSW_ETHERSWITCH #include #include "etherswitch_if.h" #endif #include "if_cpswreg.h" #include "if_cpswvar.h" #include "miibus_if.h" /* Device probe/attach/detach. */ static int cpsw_probe(device_t); static int cpsw_attach(device_t); static int cpsw_detach(device_t); static int cpswp_probe(device_t); static int cpswp_attach(device_t); static int cpswp_detach(device_t); static phandle_t cpsw_get_node(device_t, device_t); /* Device Init/shutdown. */ static int cpsw_shutdown(device_t); static void cpswp_init(void *); static void cpswp_init_locked(void *); static void cpswp_stop_locked(struct cpswp_softc *); /* Device Suspend/Resume. */ static int cpsw_suspend(device_t); static int cpsw_resume(device_t); /* Ioctl. */ static int cpswp_ioctl(struct ifnet *, u_long command, caddr_t data); static int cpswp_miibus_readreg(device_t, int phy, int reg); static int cpswp_miibus_writereg(device_t, int phy, int reg, int value); static void cpswp_miibus_statchg(device_t); /* Send/Receive packets. */ static void cpsw_intr_rx(void *arg); static struct mbuf *cpsw_rx_dequeue(struct cpsw_softc *); static void cpsw_rx_enqueue(struct cpsw_softc *); static void cpswp_start(struct ifnet *); static void cpsw_intr_tx(void *); static void cpswp_tx_enqueue(struct cpswp_softc *); static int cpsw_tx_dequeue(struct cpsw_softc *); /* Misc interrupts and watchdog. */ static void cpsw_intr_rx_thresh(void *); static void cpsw_intr_misc(void *); static void cpswp_tick(void *); static void cpswp_ifmedia_sts(struct ifnet *, struct ifmediareq *); static int cpswp_ifmedia_upd(struct ifnet *); static void cpsw_tx_watchdog(void *); /* ALE support */ static void cpsw_ale_read_entry(struct cpsw_softc *, uint16_t, uint32_t *); static void cpsw_ale_write_entry(struct cpsw_softc *, uint16_t, uint32_t *); static int cpsw_ale_mc_entry_set(struct cpsw_softc *, uint8_t, int, uint8_t *); static void cpsw_ale_dump_table(struct cpsw_softc *); static int cpsw_ale_update_vlan_table(struct cpsw_softc *, int, int, int, int, int); static int cpswp_ale_update_addresses(struct cpswp_softc *, int); /* Statistics and sysctls. */ static void cpsw_add_sysctls(struct cpsw_softc *); static void cpsw_stats_collect(struct cpsw_softc *); static int cpsw_stats_sysctl(SYSCTL_HANDLER_ARGS); #ifdef CPSW_ETHERSWITCH static etherswitch_info_t *cpsw_getinfo(device_t); static int cpsw_getport(device_t, etherswitch_port_t *); static int cpsw_setport(device_t, etherswitch_port_t *); static int cpsw_getconf(device_t, etherswitch_conf_t *); static int cpsw_getvgroup(device_t, etherswitch_vlangroup_t *); static int cpsw_setvgroup(device_t, etherswitch_vlangroup_t *); static int cpsw_readreg(device_t, int); static int cpsw_writereg(device_t, int, int); static int cpsw_readphy(device_t, int, int); static int cpsw_writephy(device_t, int, int, int); #endif /* * Arbitrary limit on number of segments in an mbuf to be transmitted. * Packets with more segments than this will be defragmented before * they are queued. */ #define CPSW_TXFRAGS 16 /* Shared resources. */ static device_method_t cpsw_methods[] = { /* Device interface */ DEVMETHOD(device_probe, cpsw_probe), DEVMETHOD(device_attach, cpsw_attach), DEVMETHOD(device_detach, cpsw_detach), DEVMETHOD(device_shutdown, cpsw_shutdown), DEVMETHOD(device_suspend, cpsw_suspend), DEVMETHOD(device_resume, cpsw_resume), /* Bus interface */ DEVMETHOD(bus_add_child, device_add_child_ordered), /* OFW methods */ DEVMETHOD(ofw_bus_get_node, cpsw_get_node), #ifdef CPSW_ETHERSWITCH /* etherswitch interface */ DEVMETHOD(etherswitch_getinfo, cpsw_getinfo), DEVMETHOD(etherswitch_readreg, cpsw_readreg), DEVMETHOD(etherswitch_writereg, cpsw_writereg), DEVMETHOD(etherswitch_readphyreg, cpsw_readphy), DEVMETHOD(etherswitch_writephyreg, cpsw_writephy), DEVMETHOD(etherswitch_getport, cpsw_getport), DEVMETHOD(etherswitch_setport, cpsw_setport), DEVMETHOD(etherswitch_getvgroup, cpsw_getvgroup), DEVMETHOD(etherswitch_setvgroup, cpsw_setvgroup), DEVMETHOD(etherswitch_getconf, cpsw_getconf), #endif DEVMETHOD_END }; static driver_t cpsw_driver = { "cpswss", cpsw_methods, sizeof(struct cpsw_softc), }; static devclass_t cpsw_devclass; DRIVER_MODULE(cpswss, simplebus, cpsw_driver, cpsw_devclass, 0, 0); /* Port/Slave resources. */ static device_method_t cpswp_methods[] = { /* Device interface */ DEVMETHOD(device_probe, cpswp_probe), DEVMETHOD(device_attach, cpswp_attach), DEVMETHOD(device_detach, cpswp_detach), /* MII interface */ DEVMETHOD(miibus_readreg, cpswp_miibus_readreg), DEVMETHOD(miibus_writereg, cpswp_miibus_writereg), DEVMETHOD(miibus_statchg, cpswp_miibus_statchg), DEVMETHOD_END }; static driver_t cpswp_driver = { "cpsw", cpswp_methods, sizeof(struct cpswp_softc), }; static devclass_t cpswp_devclass; #ifdef CPSW_ETHERSWITCH DRIVER_MODULE(etherswitch, cpswss, etherswitch_driver, etherswitch_devclass, 0, 0); MODULE_DEPEND(cpswss, etherswitch, 1, 1, 1); #endif DRIVER_MODULE(cpsw, cpswss, cpswp_driver, cpswp_devclass, 0, 0); DRIVER_MODULE(miibus, cpsw, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(cpsw, ether, 1, 1, 1); MODULE_DEPEND(cpsw, miibus, 1, 1, 1); #ifdef CPSW_ETHERSWITCH static struct cpsw_vlangroups cpsw_vgroups[CPSW_VLANS]; #endif static uint32_t slave_mdio_addr[] = { 0x4a100200, 0x4a100300 }; static struct resource_spec irq_res_spec[] = { { SYS_RES_IRQ, 0, RF_ACTIVE | RF_SHAREABLE }, { SYS_RES_IRQ, 1, RF_ACTIVE | RF_SHAREABLE }, { SYS_RES_IRQ, 2, RF_ACTIVE | RF_SHAREABLE }, { SYS_RES_IRQ, 3, RF_ACTIVE | RF_SHAREABLE }, { -1, 0 } }; static struct { void (*cb)(void *); } cpsw_intr_cb[] = { { cpsw_intr_rx_thresh }, { cpsw_intr_rx }, { cpsw_intr_tx }, { cpsw_intr_misc }, }; /* Number of entries here must match size of stats * array in struct cpswp_softc. */ static struct cpsw_stat { int reg; char *oid; } cpsw_stat_sysctls[CPSW_SYSCTL_COUNT] = { {0x00, "GoodRxFrames"}, {0x04, "BroadcastRxFrames"}, {0x08, "MulticastRxFrames"}, {0x0C, "PauseRxFrames"}, {0x10, "RxCrcErrors"}, {0x14, "RxAlignErrors"}, {0x18, "OversizeRxFrames"}, {0x1c, "RxJabbers"}, {0x20, "ShortRxFrames"}, {0x24, "RxFragments"}, {0x30, "RxOctets"}, {0x34, "GoodTxFrames"}, {0x38, "BroadcastTxFrames"}, {0x3c, "MulticastTxFrames"}, {0x40, "PauseTxFrames"}, {0x44, "DeferredTxFrames"}, {0x48, "CollisionsTxFrames"}, {0x4c, "SingleCollisionTxFrames"}, {0x50, "MultipleCollisionTxFrames"}, {0x54, "ExcessiveCollisions"}, {0x58, "LateCollisions"}, {0x5c, "TxUnderrun"}, {0x60, "CarrierSenseErrors"}, {0x64, "TxOctets"}, {0x68, "RxTx64OctetFrames"}, {0x6c, "RxTx65to127OctetFrames"}, {0x70, "RxTx128to255OctetFrames"}, {0x74, "RxTx256to511OctetFrames"}, {0x78, "RxTx512to1024OctetFrames"}, {0x7c, "RxTx1024upOctetFrames"}, {0x80, "NetOctets"}, {0x84, "RxStartOfFrameOverruns"}, {0x88, "RxMiddleOfFrameOverruns"}, {0x8c, "RxDmaOverruns"} }; /* * Basic debug support. */ static void cpsw_debugf_head(const char *funcname) { int t = (int)(time_second % (24 * 60 * 60)); printf("%02d:%02d:%02d %s ", t / (60 * 60), (t / 60) % 60, t % 60, funcname); } static void cpsw_debugf(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } #define CPSW_DEBUGF(_sc, a) do { \ if ((_sc)->debug) { \ cpsw_debugf_head(__func__); \ cpsw_debugf a; \ } \ } while (0) /* * Locking macros */ #define CPSW_TX_LOCK(sc) do { \ mtx_assert(&(sc)->rx.lock, MA_NOTOWNED); \ mtx_lock(&(sc)->tx.lock); \ } while (0) #define CPSW_TX_UNLOCK(sc) mtx_unlock(&(sc)->tx.lock) #define CPSW_TX_LOCK_ASSERT(sc) mtx_assert(&(sc)->tx.lock, MA_OWNED) #define CPSW_RX_LOCK(sc) do { \ mtx_assert(&(sc)->tx.lock, MA_NOTOWNED); \ mtx_lock(&(sc)->rx.lock); \ } while (0) #define CPSW_RX_UNLOCK(sc) mtx_unlock(&(sc)->rx.lock) #define CPSW_RX_LOCK_ASSERT(sc) mtx_assert(&(sc)->rx.lock, MA_OWNED) #define CPSW_PORT_LOCK(_sc) do { \ mtx_assert(&(_sc)->lock, MA_NOTOWNED); \ mtx_lock(&(_sc)->lock); \ } while (0) #define CPSW_PORT_UNLOCK(_sc) mtx_unlock(&(_sc)->lock) #define CPSW_PORT_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->lock, MA_OWNED) /* * Read/Write macros */ #define cpsw_read_4(_sc, _reg) bus_read_4((_sc)->mem_res, (_reg)) #define cpsw_write_4(_sc, _reg, _val) \ bus_write_4((_sc)->mem_res, (_reg), (_val)) #define cpsw_cpdma_bd_offset(i) (CPSW_CPPI_RAM_OFFSET + ((i)*16)) #define cpsw_cpdma_bd_paddr(sc, slot) \ BUS_SPACE_PHYSADDR(sc->mem_res, slot->bd_offset) #define cpsw_cpdma_read_bd(sc, slot, val) \ bus_read_region_4(sc->mem_res, slot->bd_offset, (uint32_t *) val, 4) #define cpsw_cpdma_write_bd(sc, slot, val) \ bus_write_region_4(sc->mem_res, slot->bd_offset, (uint32_t *) val, 4) #define cpsw_cpdma_write_bd_next(sc, slot, next_slot) \ cpsw_write_4(sc, slot->bd_offset, cpsw_cpdma_bd_paddr(sc, next_slot)) #define cpsw_cpdma_write_bd_flags(sc, slot, val) \ bus_write_2(sc->mem_res, slot->bd_offset + 14, val) #define cpsw_cpdma_read_bd_flags(sc, slot) \ bus_read_2(sc->mem_res, slot->bd_offset + 14) #define cpsw_write_hdp_slot(sc, queue, slot) \ cpsw_write_4(sc, (queue)->hdp_offset, cpsw_cpdma_bd_paddr(sc, slot)) #define CP_OFFSET (CPSW_CPDMA_TX_CP(0) - CPSW_CPDMA_TX_HDP(0)) #define cpsw_read_cp(sc, queue) \ cpsw_read_4(sc, (queue)->hdp_offset + CP_OFFSET) #define cpsw_write_cp(sc, queue, val) \ cpsw_write_4(sc, (queue)->hdp_offset + CP_OFFSET, (val)) #define cpsw_write_cp_slot(sc, queue, slot) \ cpsw_write_cp(sc, queue, cpsw_cpdma_bd_paddr(sc, slot)) #if 0 /* XXX temporary function versions for debugging. */ static void cpsw_write_hdp_slotX(struct cpsw_softc *sc, struct cpsw_queue *queue, struct cpsw_slot *slot) { uint32_t reg = queue->hdp_offset; uint32_t v = cpsw_cpdma_bd_paddr(sc, slot); CPSW_DEBUGF(("HDP <=== 0x%08x (was 0x%08x)", v, cpsw_read_4(sc, reg))); cpsw_write_4(sc, reg, v); } static void cpsw_write_cp_slotX(struct cpsw_softc *sc, struct cpsw_queue *queue, struct cpsw_slot *slot) { uint32_t v = cpsw_cpdma_bd_paddr(sc, slot); CPSW_DEBUGF(("CP <=== 0x%08x (expecting 0x%08x)", v, cpsw_read_cp(sc, queue))); cpsw_write_cp(sc, queue, v); } #endif /* * Expanded dump routines for verbose debugging. */ static void cpsw_dump_slot(struct cpsw_softc *sc, struct cpsw_slot *slot) { static const char *flags[] = {"SOP", "EOP", "Owner", "EOQ", "TDownCmplt", "PassCRC", "Long", "Short", "MacCtl", "Overrun", "PktErr1", "PortEn/PktErr0", "RxVlanEncap", "Port2", "Port1", "Port0"}; struct cpsw_cpdma_bd bd; const char *sep; int i; cpsw_cpdma_read_bd(sc, slot, &bd); printf("BD Addr : 0x%08x Next : 0x%08x\n", cpsw_cpdma_bd_paddr(sc, slot), bd.next); printf(" BufPtr: 0x%08x BufLen: 0x%08x\n", bd.bufptr, bd.buflen); printf(" BufOff: 0x%08x PktLen: 0x%08x\n", bd.bufoff, bd.pktlen); printf(" Flags: "); sep = ""; for (i = 0; i < 16; ++i) { if (bd.flags & (1 << (15 - i))) { printf("%s%s", sep, flags[i]); sep = ","; } } printf("\n"); if (slot->mbuf) { printf(" Ether: %14D\n", (char *)(slot->mbuf->m_data), " "); printf(" Packet: %16D\n", (char *)(slot->mbuf->m_data) + 14, " "); } } #define CPSW_DUMP_SLOT(cs, slot) do { \ IF_DEBUG(sc) { \ cpsw_dump_slot(sc, slot); \ } \ } while (0) static void cpsw_dump_queue(struct cpsw_softc *sc, struct cpsw_slots *q) { struct cpsw_slot *slot; int i = 0; int others = 0; STAILQ_FOREACH(slot, q, next) { if (i > CPSW_TXFRAGS) ++others; else cpsw_dump_slot(sc, slot); ++i; } if (others) printf(" ... and %d more.\n", others); printf("\n"); } #define CPSW_DUMP_QUEUE(sc, q) do { \ IF_DEBUG(sc) { \ cpsw_dump_queue(sc, q); \ } \ } while (0) static void cpsw_init_slots(struct cpsw_softc *sc) { struct cpsw_slot *slot; int i; STAILQ_INIT(&sc->avail); /* Put the slot descriptors onto the global avail list. */ for (i = 0; i < nitems(sc->_slots); i++) { slot = &sc->_slots[i]; slot->bd_offset = cpsw_cpdma_bd_offset(i); STAILQ_INSERT_TAIL(&sc->avail, slot, next); } } static int cpsw_add_slots(struct cpsw_softc *sc, struct cpsw_queue *queue, int requested) { const int max_slots = nitems(sc->_slots); struct cpsw_slot *slot; int i; if (requested < 0) requested = max_slots; for (i = 0; i < requested; ++i) { slot = STAILQ_FIRST(&sc->avail); if (slot == NULL) return (0); if (bus_dmamap_create(sc->mbuf_dtag, 0, &slot->dmamap)) { device_printf(sc->dev, "failed to create dmamap\n"); return (ENOMEM); } STAILQ_REMOVE_HEAD(&sc->avail, next); STAILQ_INSERT_TAIL(&queue->avail, slot, next); ++queue->avail_queue_len; ++queue->queue_slots; } return (0); } static void cpsw_free_slot(struct cpsw_softc *sc, struct cpsw_slot *slot) { int error; if (slot->dmamap) { if (slot->mbuf) bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap); error = bus_dmamap_destroy(sc->mbuf_dtag, slot->dmamap); KASSERT(error == 0, ("Mapping still active")); slot->dmamap = NULL; } if (slot->mbuf) { m_freem(slot->mbuf); slot->mbuf = NULL; } } static void cpsw_reset(struct cpsw_softc *sc) { int i; callout_stop(&sc->watchdog.callout); /* Reset RMII/RGMII wrapper. */ cpsw_write_4(sc, CPSW_WR_SOFT_RESET, 1); while (cpsw_read_4(sc, CPSW_WR_SOFT_RESET) & 1) ; /* Disable TX and RX interrupts for all cores. */ for (i = 0; i < 3; ++i) { cpsw_write_4(sc, CPSW_WR_C_RX_THRESH_EN(i), 0x00); cpsw_write_4(sc, CPSW_WR_C_TX_EN(i), 0x00); cpsw_write_4(sc, CPSW_WR_C_RX_EN(i), 0x00); cpsw_write_4(sc, CPSW_WR_C_MISC_EN(i), 0x00); } /* Reset CPSW subsystem. */ cpsw_write_4(sc, CPSW_SS_SOFT_RESET, 1); while (cpsw_read_4(sc, CPSW_SS_SOFT_RESET) & 1) ; /* Reset Sliver port 1 and 2 */ for (i = 0; i < 2; i++) { /* Reset */ cpsw_write_4(sc, CPSW_SL_SOFT_RESET(i), 1); while (cpsw_read_4(sc, CPSW_SL_SOFT_RESET(i)) & 1) ; } /* Reset DMA controller. */ cpsw_write_4(sc, CPSW_CPDMA_SOFT_RESET, 1); while (cpsw_read_4(sc, CPSW_CPDMA_SOFT_RESET) & 1) ; /* Disable TX & RX DMA */ cpsw_write_4(sc, CPSW_CPDMA_TX_CONTROL, 0); cpsw_write_4(sc, CPSW_CPDMA_RX_CONTROL, 0); /* Clear all queues. */ for (i = 0; i < 8; i++) { cpsw_write_4(sc, CPSW_CPDMA_TX_HDP(i), 0); cpsw_write_4(sc, CPSW_CPDMA_RX_HDP(i), 0); cpsw_write_4(sc, CPSW_CPDMA_TX_CP(i), 0); cpsw_write_4(sc, CPSW_CPDMA_RX_CP(i), 0); } /* Clear all interrupt Masks */ cpsw_write_4(sc, CPSW_CPDMA_RX_INTMASK_CLEAR, 0xFFFFFFFF); cpsw_write_4(sc, CPSW_CPDMA_TX_INTMASK_CLEAR, 0xFFFFFFFF); } static void cpsw_init(struct cpsw_softc *sc) { struct cpsw_slot *slot; uint32_t reg; /* Disable the interrupt pacing. */ reg = cpsw_read_4(sc, CPSW_WR_INT_CONTROL); reg &= ~(CPSW_WR_INT_PACE_EN | CPSW_WR_INT_PRESCALE_MASK); cpsw_write_4(sc, CPSW_WR_INT_CONTROL, reg); /* Clear ALE */ cpsw_write_4(sc, CPSW_ALE_CONTROL, CPSW_ALE_CTL_CLEAR_TBL); /* Enable ALE */ reg = CPSW_ALE_CTL_ENABLE; if (sc->dualemac) reg |= CPSW_ALE_CTL_VLAN_AWARE; cpsw_write_4(sc, CPSW_ALE_CONTROL, reg); /* Set Host Port Mapping. */ cpsw_write_4(sc, CPSW_PORT_P0_CPDMA_TX_PRI_MAP, 0x76543210); cpsw_write_4(sc, CPSW_PORT_P0_CPDMA_RX_CH_MAP, 0); /* Initialize ALE: set host port to forwarding(3). */ cpsw_write_4(sc, CPSW_ALE_PORTCTL(0), ALE_PORTCTL_INGRESS | ALE_PORTCTL_FORWARD); cpsw_write_4(sc, CPSW_SS_PTYPE, 0); /* Enable statistics for ports 0, 1 and 2 */ cpsw_write_4(sc, CPSW_SS_STAT_PORT_EN, 7); /* Turn off flow control. */ cpsw_write_4(sc, CPSW_SS_FLOW_CONTROL, 0); /* Make IP hdr aligned with 4 */ cpsw_write_4(sc, CPSW_CPDMA_RX_BUFFER_OFFSET, 2); /* Initialize RX Buffer Descriptors */ cpsw_write_4(sc, CPSW_CPDMA_RX_PENDTHRESH(0), 0); cpsw_write_4(sc, CPSW_CPDMA_RX_FREEBUFFER(0), 0); /* Enable TX & RX DMA */ cpsw_write_4(sc, CPSW_CPDMA_TX_CONTROL, 1); cpsw_write_4(sc, CPSW_CPDMA_RX_CONTROL, 1); /* Enable Interrupts for core 0 */ cpsw_write_4(sc, CPSW_WR_C_RX_THRESH_EN(0), 0xFF); cpsw_write_4(sc, CPSW_WR_C_RX_EN(0), 0xFF); cpsw_write_4(sc, CPSW_WR_C_TX_EN(0), 0xFF); cpsw_write_4(sc, CPSW_WR_C_MISC_EN(0), 0x1F); /* Enable host Error Interrupt */ cpsw_write_4(sc, CPSW_CPDMA_DMA_INTMASK_SET, 3); /* Enable interrupts for RX and TX on Channel 0 */ cpsw_write_4(sc, CPSW_CPDMA_RX_INTMASK_SET, CPSW_CPDMA_RX_INT(0) | CPSW_CPDMA_RX_INT_THRESH(0)); cpsw_write_4(sc, CPSW_CPDMA_TX_INTMASK_SET, 1); /* Initialze MDIO - ENABLE, PREAMBLE=0, FAULTENB, CLKDIV=0xFF */ /* TODO Calculate MDCLK=CLK/(CLKDIV+1) */ cpsw_write_4(sc, MDIOCONTROL, MDIOCTL_ENABLE | MDIOCTL_FAULTENB | 0xff); /* Select MII in GMII_SEL, Internal Delay mode */ //ti_scm_reg_write_4(0x650, 0); /* Initialize active queues. */ slot = STAILQ_FIRST(&sc->tx.active); if (slot != NULL) cpsw_write_hdp_slot(sc, &sc->tx, slot); slot = STAILQ_FIRST(&sc->rx.active); if (slot != NULL) cpsw_write_hdp_slot(sc, &sc->rx, slot); cpsw_rx_enqueue(sc); cpsw_write_4(sc, CPSW_CPDMA_RX_FREEBUFFER(0), sc->rx.active_queue_len); cpsw_write_4(sc, CPSW_CPDMA_RX_PENDTHRESH(0), CPSW_TXFRAGS); /* Activate network interface. */ sc->rx.running = 1; sc->tx.running = 1; sc->watchdog.timer = 0; callout_init(&sc->watchdog.callout, 0); callout_reset(&sc->watchdog.callout, hz, cpsw_tx_watchdog, sc); } /* * * Device Probe, Attach, Detach. * */ static int cpsw_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "ti,cpsw")) return (ENXIO); device_set_desc(dev, "3-port Switch Ethernet Subsystem"); return (BUS_PROBE_DEFAULT); } static int cpsw_intr_attach(struct cpsw_softc *sc) { int i; for (i = 0; i < CPSW_INTR_COUNT; i++) { if (bus_setup_intr(sc->dev, sc->irq_res[i], INTR_TYPE_NET | INTR_MPSAFE, NULL, cpsw_intr_cb[i].cb, sc, &sc->ih_cookie[i]) != 0) { return (-1); } } return (0); } static void cpsw_intr_detach(struct cpsw_softc *sc) { int i; for (i = 0; i < CPSW_INTR_COUNT; i++) { if (sc->ih_cookie[i]) { bus_teardown_intr(sc->dev, sc->irq_res[i], sc->ih_cookie[i]); } } } static int cpsw_get_fdt_data(struct cpsw_softc *sc, int port) { char *name; int len, phy, vlan; pcell_t phy_id[3], vlan_id; phandle_t child; unsigned long mdio_child_addr; /* Find any slave with phy-handle/phy_id */ phy = -1; vlan = -1; for (child = OF_child(sc->node); child != 0; child = OF_peer(child)) { if (OF_getprop_alloc(child, "name", (void **)&name) < 0) continue; if (sscanf(name, "slave@%lx", &mdio_child_addr) != 1) { OF_prop_free(name); continue; } OF_prop_free(name); - if (mdio_child_addr != slave_mdio_addr[port]) + + if (mdio_child_addr != slave_mdio_addr[port] && + mdio_child_addr != (slave_mdio_addr[port] & 0xFFF)) continue; if (fdt_get_phyaddr(child, NULL, &phy, NULL) != 0){ /* Users with old DTB will have phy_id instead */ phy = -1; len = OF_getproplen(child, "phy_id"); if (len / sizeof(pcell_t) == 2) { /* Get phy address from fdt */ if (OF_getencprop(child, "phy_id", phy_id, len) > 0) phy = phy_id[1]; } } len = OF_getproplen(child, "dual_emac_res_vlan"); if (len / sizeof(pcell_t) == 1) { /* Get phy address from fdt */ if (OF_getencprop(child, "dual_emac_res_vlan", &vlan_id, len) > 0) { vlan = vlan_id; } } break; } if (phy == -1) return (ENXIO); sc->port[port].phy = phy; sc->port[port].vlan = vlan; return (0); } static int cpsw_attach(device_t dev) { int error, i; struct cpsw_softc *sc; uint32_t reg; sc = device_get_softc(dev); sc->dev = dev; sc->node = ofw_bus_get_node(dev); getbinuptime(&sc->attach_uptime); if (OF_getencprop(sc->node, "active_slave", &sc->active_slave, sizeof(sc->active_slave)) <= 0) { sc->active_slave = 0; } if (sc->active_slave > 1) sc->active_slave = 1; if (OF_hasprop(sc->node, "dual_emac")) sc->dualemac = 1; for (i = 0; i < CPSW_PORTS; i++) { if (!sc->dualemac && i != sc->active_slave) continue; if (cpsw_get_fdt_data(sc, i) != 0) { device_printf(dev, "failed to get PHY address from FDT\n"); return (ENXIO); } } /* Initialize mutexes */ mtx_init(&sc->tx.lock, device_get_nameunit(dev), "cpsw TX lock", MTX_DEF); mtx_init(&sc->rx.lock, device_get_nameunit(dev), "cpsw RX lock", MTX_DEF); /* Allocate IRQ resources */ error = bus_alloc_resources(dev, irq_res_spec, sc->irq_res); if (error) { device_printf(dev, "could not allocate IRQ resources\n"); cpsw_detach(dev); return (ENXIO); } sc->mem_rid = 0; sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->mem_rid, RF_ACTIVE); if (sc->mem_res == NULL) { device_printf(sc->dev, "failed to allocate memory resource\n"); cpsw_detach(dev); return (ENXIO); } reg = cpsw_read_4(sc, CPSW_SS_IDVER); device_printf(dev, "CPSW SS Version %d.%d (%d)\n", (reg >> 8 & 0x7), reg & 0xFF, (reg >> 11) & 0x1F); cpsw_add_sysctls(sc); /* Allocate a busdma tag and DMA safe memory for mbufs. */ error = bus_dma_tag_create( bus_get_dma_tag(sc->dev), /* parent */ 1, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filtfunc, filtfuncarg */ MCLBYTES, CPSW_TXFRAGS, /* maxsize, nsegments */ MCLBYTES, 0, /* maxsegsz, flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &sc->mbuf_dtag); /* dmatag */ if (error) { device_printf(dev, "bus_dma_tag_create failed\n"); cpsw_detach(dev); return (error); } /* Allocate a NULL buffer for padding. */ sc->nullpad = malloc(ETHER_MIN_LEN, M_DEVBUF, M_WAITOK | M_ZERO); cpsw_init_slots(sc); /* Allocate slots to TX and RX queues. */ STAILQ_INIT(&sc->rx.avail); STAILQ_INIT(&sc->rx.active); STAILQ_INIT(&sc->tx.avail); STAILQ_INIT(&sc->tx.active); // For now: 128 slots to TX, rest to RX. // XXX TODO: start with 32/64 and grow dynamically based on demand. if (cpsw_add_slots(sc, &sc->tx, 128) || cpsw_add_slots(sc, &sc->rx, -1)) { device_printf(dev, "failed to allocate dmamaps\n"); cpsw_detach(dev); return (ENOMEM); } device_printf(dev, "Initial queue size TX=%d RX=%d\n", sc->tx.queue_slots, sc->rx.queue_slots); sc->tx.hdp_offset = CPSW_CPDMA_TX_HDP(0); sc->rx.hdp_offset = CPSW_CPDMA_RX_HDP(0); if (cpsw_intr_attach(sc) == -1) { device_printf(dev, "failed to setup interrupts\n"); cpsw_detach(dev); return (ENXIO); } #ifdef CPSW_ETHERSWITCH for (i = 0; i < CPSW_VLANS; i++) cpsw_vgroups[i].vid = -1; #endif /* Reset the controller. */ cpsw_reset(sc); cpsw_init(sc); for (i = 0; i < CPSW_PORTS; i++) { if (!sc->dualemac && i != sc->active_slave) continue; sc->port[i].dev = device_add_child(dev, "cpsw", i); if (sc->port[i].dev == NULL) { cpsw_detach(dev); return (ENXIO); } } bus_generic_probe(dev); bus_generic_attach(dev); return (0); } static int cpsw_detach(device_t dev) { struct cpsw_softc *sc; int error, i; bus_generic_detach(dev); sc = device_get_softc(dev); for (i = 0; i < CPSW_PORTS; i++) { if (sc->port[i].dev) device_delete_child(dev, sc->port[i].dev); } if (device_is_attached(dev)) { callout_stop(&sc->watchdog.callout); callout_drain(&sc->watchdog.callout); } /* Stop and release all interrupts */ cpsw_intr_detach(sc); /* Free dmamaps and mbufs */ for (i = 0; i < nitems(sc->_slots); ++i) cpsw_free_slot(sc, &sc->_slots[i]); /* Free null padding buffer. */ if (sc->nullpad) free(sc->nullpad, M_DEVBUF); /* Free DMA tag */ if (sc->mbuf_dtag) { error = bus_dma_tag_destroy(sc->mbuf_dtag); KASSERT(error == 0, ("Unable to destroy DMA tag")); } /* Free IO memory handler */ if (sc->mem_res != NULL) bus_release_resource(dev, SYS_RES_MEMORY, sc->mem_rid, sc->mem_res); bus_release_resources(dev, irq_res_spec, sc->irq_res); /* Destroy mutexes */ mtx_destroy(&sc->rx.lock); mtx_destroy(&sc->tx.lock); /* Detach the switch device, if present. */ error = bus_generic_detach(dev); if (error != 0) return (error); return (device_delete_children(dev)); } static phandle_t cpsw_get_node(device_t bus, device_t dev) { /* Share controller node with port device. */ return (ofw_bus_get_node(bus)); } static int cpswp_probe(device_t dev) { if (device_get_unit(dev) > 1) { device_printf(dev, "Only two ports are supported.\n"); return (ENXIO); } device_set_desc(dev, "Ethernet Switch Port"); return (BUS_PROBE_DEFAULT); } static int cpswp_attach(device_t dev) { int error; struct ifnet *ifp; struct cpswp_softc *sc; uint32_t reg; uint8_t mac_addr[ETHER_ADDR_LEN]; sc = device_get_softc(dev); sc->dev = dev; sc->pdev = device_get_parent(dev); sc->swsc = device_get_softc(sc->pdev); sc->unit = device_get_unit(dev); sc->phy = sc->swsc->port[sc->unit].phy; sc->vlan = sc->swsc->port[sc->unit].vlan; if (sc->swsc->dualemac && sc->vlan == -1) sc->vlan = sc->unit + 1; if (sc->unit == 0) { sc->physel = MDIOUSERPHYSEL0; sc->phyaccess = MDIOUSERACCESS0; } else { sc->physel = MDIOUSERPHYSEL1; sc->phyaccess = MDIOUSERACCESS1; } mtx_init(&sc->lock, device_get_nameunit(dev), "cpsw port lock", MTX_DEF); /* Allocate network interface */ ifp = sc->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { cpswp_detach(dev); return (ENXIO); } if_initname(ifp, device_get_name(sc->dev), sc->unit); ifp->if_softc = sc; ifp->if_flags = IFF_SIMPLEX | IFF_MULTICAST | IFF_BROADCAST; ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_HWCSUM; //FIXME VLAN? ifp->if_capenable = ifp->if_capabilities; ifp->if_init = cpswp_init; ifp->if_start = cpswp_start; ifp->if_ioctl = cpswp_ioctl; ifp->if_snd.ifq_drv_maxlen = sc->swsc->tx.queue_slots; IFQ_SET_MAXLEN(&ifp->if_snd, ifp->if_snd.ifq_drv_maxlen); IFQ_SET_READY(&ifp->if_snd); /* Get high part of MAC address from control module (mac_id[0|1]_hi) */ ti_scm_reg_read_4(SCM_MAC_ID0_HI + sc->unit * 8, ®); mac_addr[0] = reg & 0xFF; mac_addr[1] = (reg >> 8) & 0xFF; mac_addr[2] = (reg >> 16) & 0xFF; mac_addr[3] = (reg >> 24) & 0xFF; /* Get low part of MAC address from control module (mac_id[0|1]_lo) */ ti_scm_reg_read_4(SCM_MAC_ID0_LO + sc->unit * 8, ®); mac_addr[4] = reg & 0xFF; mac_addr[5] = (reg >> 8) & 0xFF; error = mii_attach(dev, &sc->miibus, ifp, cpswp_ifmedia_upd, cpswp_ifmedia_sts, BMSR_DEFCAPMASK, sc->phy, MII_OFFSET_ANY, 0); if (error) { device_printf(dev, "attaching PHYs failed\n"); cpswp_detach(dev); return (error); } sc->mii = device_get_softc(sc->miibus); /* Select PHY and enable interrupts */ cpsw_write_4(sc->swsc, sc->physel, MDIO_PHYSEL_LINKINTENB | (sc->phy & 0x1F)); ether_ifattach(sc->ifp, mac_addr); callout_init(&sc->mii_callout, 0); return (0); } static int cpswp_detach(device_t dev) { struct cpswp_softc *sc; sc = device_get_softc(dev); CPSW_DEBUGF(sc->swsc, ("")); if (device_is_attached(dev)) { ether_ifdetach(sc->ifp); CPSW_PORT_LOCK(sc); cpswp_stop_locked(sc); CPSW_PORT_UNLOCK(sc); callout_drain(&sc->mii_callout); } bus_generic_detach(dev); if_free(sc->ifp); mtx_destroy(&sc->lock); return (0); } /* * * Init/Shutdown. * */ static int cpsw_ports_down(struct cpsw_softc *sc) { struct cpswp_softc *psc; struct ifnet *ifp1, *ifp2; if (!sc->dualemac) return (1); psc = device_get_softc(sc->port[0].dev); ifp1 = psc->ifp; psc = device_get_softc(sc->port[1].dev); ifp2 = psc->ifp; if ((ifp1->if_flags & IFF_UP) == 0 && (ifp2->if_flags & IFF_UP) == 0) return (1); return (0); } static void cpswp_init(void *arg) { struct cpswp_softc *sc = arg; CPSW_DEBUGF(sc->swsc, ("")); CPSW_PORT_LOCK(sc); cpswp_init_locked(arg); CPSW_PORT_UNLOCK(sc); } static void cpswp_init_locked(void *arg) { #ifdef CPSW_ETHERSWITCH int i; #endif struct cpswp_softc *sc = arg; struct ifnet *ifp; uint32_t reg; CPSW_DEBUGF(sc->swsc, ("")); CPSW_PORT_LOCK_ASSERT(sc); ifp = sc->ifp; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) return; getbinuptime(&sc->init_uptime); if (!sc->swsc->rx.running && !sc->swsc->tx.running) { /* Reset the controller. */ cpsw_reset(sc->swsc); cpsw_init(sc->swsc); } /* Set Slave Mapping. */ cpsw_write_4(sc->swsc, CPSW_SL_RX_PRI_MAP(sc->unit), 0x76543210); cpsw_write_4(sc->swsc, CPSW_PORT_P_TX_PRI_MAP(sc->unit + 1), 0x33221100); cpsw_write_4(sc->swsc, CPSW_SL_RX_MAXLEN(sc->unit), 0x5f2); /* Enable MAC RX/TX modules. */ /* TODO: Docs claim that IFCTL_B and IFCTL_A do the same thing? */ /* Huh? Docs call bit 0 "Loopback" some places, "FullDuplex" others. */ reg = cpsw_read_4(sc->swsc, CPSW_SL_MACCONTROL(sc->unit)); reg |= CPSW_SL_MACTL_GMII_ENABLE; cpsw_write_4(sc->swsc, CPSW_SL_MACCONTROL(sc->unit), reg); /* Initialize ALE: set port to forwarding, initialize addrs */ cpsw_write_4(sc->swsc, CPSW_ALE_PORTCTL(sc->unit + 1), ALE_PORTCTL_INGRESS | ALE_PORTCTL_FORWARD); cpswp_ale_update_addresses(sc, 1); if (sc->swsc->dualemac) { /* Set Port VID. */ cpsw_write_4(sc->swsc, CPSW_PORT_P_VLAN(sc->unit + 1), sc->vlan & 0xfff); cpsw_ale_update_vlan_table(sc->swsc, sc->vlan, (1 << (sc->unit + 1)) | (1 << 0), /* Member list */ (1 << (sc->unit + 1)) | (1 << 0), /* Untagged egress */ (1 << (sc->unit + 1)) | (1 << 0), 0); /* mcast reg flood */ #ifdef CPSW_ETHERSWITCH for (i = 0; i < CPSW_VLANS; i++) { if (cpsw_vgroups[i].vid != -1) continue; cpsw_vgroups[i].vid = sc->vlan; break; } #endif } mii_mediachg(sc->mii); callout_reset(&sc->mii_callout, hz, cpswp_tick, sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } static int cpsw_shutdown(device_t dev) { struct cpsw_softc *sc; struct cpswp_softc *psc; int i; sc = device_get_softc(dev); CPSW_DEBUGF(sc, ("")); for (i = 0; i < CPSW_PORTS; i++) { if (!sc->dualemac && i != sc->active_slave) continue; psc = device_get_softc(sc->port[i].dev); CPSW_PORT_LOCK(psc); cpswp_stop_locked(psc); CPSW_PORT_UNLOCK(psc); } return (0); } static void cpsw_rx_teardown(struct cpsw_softc *sc) { int i = 0; CPSW_RX_LOCK(sc); CPSW_DEBUGF(sc, ("starting RX teardown")); sc->rx.teardown = 1; cpsw_write_4(sc, CPSW_CPDMA_RX_TEARDOWN, 0); CPSW_RX_UNLOCK(sc); while (sc->rx.running) { if (++i > 10) { device_printf(sc->dev, "Unable to cleanly shutdown receiver\n"); return; } DELAY(200); } if (!sc->rx.running) CPSW_DEBUGF(sc, ("finished RX teardown (%d retries)", i)); } static void cpsw_tx_teardown(struct cpsw_softc *sc) { int i = 0; CPSW_TX_LOCK(sc); CPSW_DEBUGF(sc, ("starting TX teardown")); /* Start the TX queue teardown if queue is not empty. */ if (STAILQ_FIRST(&sc->tx.active) != NULL) cpsw_write_4(sc, CPSW_CPDMA_TX_TEARDOWN, 0); else sc->tx.teardown = 1; cpsw_tx_dequeue(sc); while (sc->tx.running && ++i < 10) { DELAY(200); cpsw_tx_dequeue(sc); } if (sc->tx.running) { device_printf(sc->dev, "Unable to cleanly shutdown transmitter\n"); } CPSW_DEBUGF(sc, ("finished TX teardown (%d retries, %d idle buffers)", i, sc->tx.active_queue_len)); CPSW_TX_UNLOCK(sc); } static void cpswp_stop_locked(struct cpswp_softc *sc) { struct ifnet *ifp; uint32_t reg; ifp = sc->ifp; CPSW_DEBUGF(sc->swsc, ("")); CPSW_PORT_LOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; /* Disable interface */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* Stop ticker */ callout_stop(&sc->mii_callout); /* Tear down the RX/TX queues. */ if (cpsw_ports_down(sc->swsc)) { cpsw_rx_teardown(sc->swsc); cpsw_tx_teardown(sc->swsc); } /* Stop MAC RX/TX modules. */ reg = cpsw_read_4(sc->swsc, CPSW_SL_MACCONTROL(sc->unit)); reg &= ~CPSW_SL_MACTL_GMII_ENABLE; cpsw_write_4(sc->swsc, CPSW_SL_MACCONTROL(sc->unit), reg); if (cpsw_ports_down(sc->swsc)) { /* Capture stats before we reset controller. */ cpsw_stats_collect(sc->swsc); cpsw_reset(sc->swsc); cpsw_init(sc->swsc); } } /* * Suspend/Resume. */ static int cpsw_suspend(device_t dev) { struct cpsw_softc *sc; struct cpswp_softc *psc; int i; sc = device_get_softc(dev); CPSW_DEBUGF(sc, ("")); for (i = 0; i < CPSW_PORTS; i++) { if (!sc->dualemac && i != sc->active_slave) continue; psc = device_get_softc(sc->port[i].dev); CPSW_PORT_LOCK(psc); cpswp_stop_locked(psc); CPSW_PORT_UNLOCK(psc); } return (0); } static int cpsw_resume(device_t dev) { struct cpsw_softc *sc; sc = device_get_softc(dev); CPSW_DEBUGF(sc, ("UNIMPLEMENTED")); return (0); } /* * * IOCTL * */ static void cpsw_set_promisc(struct cpswp_softc *sc, int set) { uint32_t reg; /* * Enabling promiscuous mode requires ALE_BYPASS to be enabled. * That disables the ALE forwarding logic and causes every * packet to be sent only to the host port. In bypass mode, * the ALE processes host port transmit packets the same as in * normal mode. */ reg = cpsw_read_4(sc->swsc, CPSW_ALE_CONTROL); reg &= ~CPSW_ALE_CTL_BYPASS; if (set) reg |= CPSW_ALE_CTL_BYPASS; cpsw_write_4(sc->swsc, CPSW_ALE_CONTROL, reg); } static void cpsw_set_allmulti(struct cpswp_softc *sc, int set) { if (set) { printf("All-multicast mode unimplemented\n"); } } static int cpswp_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct cpswp_softc *sc; struct ifreq *ifr; int error; uint32_t changed; error = 0; sc = ifp->if_softc; ifr = (struct ifreq *)data; switch (command) { case SIOCSIFCAP: changed = ifp->if_capenable ^ ifr->ifr_reqcap; if (changed & IFCAP_HWCSUM) { if ((ifr->ifr_reqcap & changed) & IFCAP_HWCSUM) ifp->if_capenable |= IFCAP_HWCSUM; else ifp->if_capenable &= ~IFCAP_HWCSUM; } error = 0; break; case SIOCSIFFLAGS: CPSW_PORT_LOCK(sc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { changed = ifp->if_flags ^ sc->if_flags; CPSW_DEBUGF(sc->swsc, ("SIOCSIFFLAGS: UP & RUNNING (changed=0x%x)", changed)); if (changed & IFF_PROMISC) cpsw_set_promisc(sc, ifp->if_flags & IFF_PROMISC); if (changed & IFF_ALLMULTI) cpsw_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI); } else { CPSW_DEBUGF(sc->swsc, ("SIOCSIFFLAGS: starting up")); cpswp_init_locked(sc); } } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { CPSW_DEBUGF(sc->swsc, ("SIOCSIFFLAGS: shutting down")); cpswp_stop_locked(sc); } sc->if_flags = ifp->if_flags; CPSW_PORT_UNLOCK(sc); break; case SIOCADDMULTI: cpswp_ale_update_addresses(sc, 0); break; case SIOCDELMULTI: /* Ugh. DELMULTI doesn't provide the specific address being removed, so the best we can do is remove everything and rebuild it all. */ cpswp_ale_update_addresses(sc, 1); break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->mii->mii_media, command); break; default: error = ether_ioctl(ifp, command, data); } return (error); } /* * * MIIBUS * */ static int cpswp_miibus_ready(struct cpsw_softc *sc, uint32_t reg) { uint32_t r, retries = CPSW_MIIBUS_RETRIES; while (--retries) { r = cpsw_read_4(sc, reg); if ((r & MDIO_PHYACCESS_GO) == 0) return (1); DELAY(CPSW_MIIBUS_DELAY); } return (0); } static int cpswp_miibus_readreg(device_t dev, int phy, int reg) { struct cpswp_softc *sc; uint32_t cmd, r; sc = device_get_softc(dev); if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess)) { device_printf(dev, "MDIO not ready to read\n"); return (0); } /* Set GO, reg, phy */ cmd = MDIO_PHYACCESS_GO | (reg & 0x1F) << 21 | (phy & 0x1F) << 16; cpsw_write_4(sc->swsc, sc->phyaccess, cmd); if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess)) { device_printf(dev, "MDIO timed out during read\n"); return (0); } r = cpsw_read_4(sc->swsc, sc->phyaccess); if ((r & MDIO_PHYACCESS_ACK) == 0) { device_printf(dev, "Failed to read from PHY.\n"); r = 0; } return (r & 0xFFFF); } static int cpswp_miibus_writereg(device_t dev, int phy, int reg, int value) { struct cpswp_softc *sc; uint32_t cmd; sc = device_get_softc(dev); if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess)) { device_printf(dev, "MDIO not ready to write\n"); return (0); } /* Set GO, WRITE, reg, phy, and value */ cmd = MDIO_PHYACCESS_GO | MDIO_PHYACCESS_WRITE | (reg & 0x1F) << 21 | (phy & 0x1F) << 16 | (value & 0xFFFF); cpsw_write_4(sc->swsc, sc->phyaccess, cmd); if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess)) { device_printf(dev, "MDIO timed out during write\n"); return (0); } return (0); } static void cpswp_miibus_statchg(device_t dev) { struct cpswp_softc *sc; uint32_t mac_control, reg; sc = device_get_softc(dev); CPSW_DEBUGF(sc->swsc, ("")); reg = CPSW_SL_MACCONTROL(sc->unit); mac_control = cpsw_read_4(sc->swsc, reg); mac_control &= ~(CPSW_SL_MACTL_GIG | CPSW_SL_MACTL_IFCTL_A | CPSW_SL_MACTL_IFCTL_B | CPSW_SL_MACTL_FULLDUPLEX); switch(IFM_SUBTYPE(sc->mii->mii_media_active)) { case IFM_1000_SX: case IFM_1000_LX: case IFM_1000_CX: case IFM_1000_T: mac_control |= CPSW_SL_MACTL_GIG; break; case IFM_100_TX: mac_control |= CPSW_SL_MACTL_IFCTL_A; break; } if (sc->mii->mii_media_active & IFM_FDX) mac_control |= CPSW_SL_MACTL_FULLDUPLEX; cpsw_write_4(sc->swsc, reg, mac_control); } /* * * Transmit/Receive Packets. * */ static void cpsw_intr_rx(void *arg) { struct cpsw_softc *sc; struct ifnet *ifp; struct mbuf *received, *next; sc = (struct cpsw_softc *)arg; CPSW_RX_LOCK(sc); if (sc->rx.teardown) { sc->rx.running = 0; sc->rx.teardown = 0; cpsw_write_cp(sc, &sc->rx, 0xfffffffc); } received = cpsw_rx_dequeue(sc); cpsw_rx_enqueue(sc); cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 1); CPSW_RX_UNLOCK(sc); while (received != NULL) { next = received->m_nextpkt; received->m_nextpkt = NULL; ifp = received->m_pkthdr.rcvif; (*ifp->if_input)(ifp, received); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); received = next; } } static struct mbuf * cpsw_rx_dequeue(struct cpsw_softc *sc) { int nsegs, port, removed; struct cpsw_cpdma_bd bd; struct cpsw_slot *last, *slot; struct cpswp_softc *psc; struct mbuf *m, *m0, *mb_head, *mb_tail; uint16_t m0_flags; nsegs = 0; m0 = NULL; last = NULL; mb_head = NULL; mb_tail = NULL; removed = 0; /* Pull completed packets off hardware RX queue. */ while ((slot = STAILQ_FIRST(&sc->rx.active)) != NULL) { cpsw_cpdma_read_bd(sc, slot, &bd); /* * Stop on packets still in use by hardware, but do not stop * on packets with the teardown complete flag, they will be * discarded later. */ if ((bd.flags & (CPDMA_BD_OWNER | CPDMA_BD_TDOWNCMPLT)) == CPDMA_BD_OWNER) break; last = slot; ++removed; STAILQ_REMOVE_HEAD(&sc->rx.active, next); STAILQ_INSERT_TAIL(&sc->rx.avail, slot, next); bus_dmamap_sync(sc->mbuf_dtag, slot->dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap); m = slot->mbuf; slot->mbuf = NULL; if (bd.flags & CPDMA_BD_TDOWNCMPLT) { CPSW_DEBUGF(sc, ("RX teardown is complete")); m_freem(m); sc->rx.running = 0; sc->rx.teardown = 0; break; } port = (bd.flags & CPDMA_BD_PORT_MASK) - 1; KASSERT(port >= 0 && port <= 1, ("patcket received with invalid port: %d", port)); psc = device_get_softc(sc->port[port].dev); /* Set up mbuf */ m->m_data += bd.bufoff; m->m_len = bd.buflen; if (bd.flags & CPDMA_BD_SOP) { m->m_pkthdr.len = bd.pktlen; m->m_pkthdr.rcvif = psc->ifp; m->m_flags |= M_PKTHDR; m0_flags = bd.flags; m0 = m; } nsegs++; m->m_next = NULL; m->m_nextpkt = NULL; if (bd.flags & CPDMA_BD_EOP && m0 != NULL) { if (m0_flags & CPDMA_BD_PASS_CRC) m_adj(m0, -ETHER_CRC_LEN); m0_flags = 0; m0 = NULL; if (nsegs > sc->rx.longest_chain) sc->rx.longest_chain = nsegs; nsegs = 0; } if ((psc->ifp->if_capenable & IFCAP_RXCSUM) != 0) { /* check for valid CRC by looking into pkt_err[5:4] */ if ((bd.flags & (CPDMA_BD_SOP | CPDMA_BD_PKT_ERR_MASK)) == CPDMA_BD_SOP) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; m->m_pkthdr.csum_flags |= CSUM_IP_VALID; m->m_pkthdr.csum_data = 0xffff; } } if (STAILQ_FIRST(&sc->rx.active) != NULL && (bd.flags & (CPDMA_BD_EOP | CPDMA_BD_EOQ)) == (CPDMA_BD_EOP | CPDMA_BD_EOQ)) { cpsw_write_hdp_slot(sc, &sc->rx, STAILQ_FIRST(&sc->rx.active)); sc->rx.queue_restart++; } /* Add mbuf to packet list to be returned. */ if (mb_tail != NULL && (bd.flags & CPDMA_BD_SOP)) { mb_tail->m_nextpkt = m; } else if (mb_tail != NULL) { mb_tail->m_next = m; } else if (mb_tail == NULL && (bd.flags & CPDMA_BD_SOP) == 0) { if (bootverbose) printf( "%s: %s: discanding fragment packet w/o header\n", __func__, psc->ifp->if_xname); m_freem(m); continue; } else { mb_head = m; } mb_tail = m; } if (removed != 0) { cpsw_write_cp_slot(sc, &sc->rx, last); sc->rx.queue_removes += removed; sc->rx.avail_queue_len += removed; sc->rx.active_queue_len -= removed; if (sc->rx.avail_queue_len > sc->rx.max_avail_queue_len) sc->rx.max_avail_queue_len = sc->rx.avail_queue_len; CPSW_DEBUGF(sc, ("Removed %d received packet(s) from RX queue", removed)); } return (mb_head); } static void cpsw_rx_enqueue(struct cpsw_softc *sc) { bus_dma_segment_t seg[1]; struct cpsw_cpdma_bd bd; struct cpsw_slot *first_new_slot, *last_old_slot, *next, *slot; int error, nsegs, added = 0; /* Register new mbufs with hardware. */ first_new_slot = NULL; last_old_slot = STAILQ_LAST(&sc->rx.active, cpsw_slot, next); while ((slot = STAILQ_FIRST(&sc->rx.avail)) != NULL) { if (first_new_slot == NULL) first_new_slot = slot; if (slot->mbuf == NULL) { slot->mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (slot->mbuf == NULL) { device_printf(sc->dev, "Unable to fill RX queue\n"); break; } slot->mbuf->m_len = slot->mbuf->m_pkthdr.len = slot->mbuf->m_ext.ext_size; } error = bus_dmamap_load_mbuf_sg(sc->mbuf_dtag, slot->dmamap, slot->mbuf, seg, &nsegs, BUS_DMA_NOWAIT); KASSERT(nsegs == 1, ("More than one segment (nsegs=%d)", nsegs)); KASSERT(error == 0, ("DMA error (error=%d)", error)); if (error != 0 || nsegs != 1) { device_printf(sc->dev, "%s: Can't prep RX buf for DMA (nsegs=%d, error=%d)\n", __func__, nsegs, error); bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap); m_freem(slot->mbuf); slot->mbuf = NULL; break; } bus_dmamap_sync(sc->mbuf_dtag, slot->dmamap, BUS_DMASYNC_PREREAD); /* Create and submit new rx descriptor. */ if ((next = STAILQ_NEXT(slot, next)) != NULL) bd.next = cpsw_cpdma_bd_paddr(sc, next); else bd.next = 0; bd.bufptr = seg->ds_addr; bd.bufoff = 0; bd.buflen = MCLBYTES - 1; bd.pktlen = bd.buflen; bd.flags = CPDMA_BD_OWNER; cpsw_cpdma_write_bd(sc, slot, &bd); ++added; STAILQ_REMOVE_HEAD(&sc->rx.avail, next); STAILQ_INSERT_TAIL(&sc->rx.active, slot, next); } if (added == 0 || first_new_slot == NULL) return; CPSW_DEBUGF(sc, ("Adding %d buffers to RX queue", added)); /* Link new entries to hardware RX queue. */ if (last_old_slot == NULL) { /* Start a fresh queue. */ cpsw_write_hdp_slot(sc, &sc->rx, first_new_slot); } else { /* Add buffers to end of current queue. */ cpsw_cpdma_write_bd_next(sc, last_old_slot, first_new_slot); } sc->rx.queue_adds += added; sc->rx.avail_queue_len -= added; sc->rx.active_queue_len += added; cpsw_write_4(sc, CPSW_CPDMA_RX_FREEBUFFER(0), added); if (sc->rx.active_queue_len > sc->rx.max_active_queue_len) sc->rx.max_active_queue_len = sc->rx.active_queue_len; } static void cpswp_start(struct ifnet *ifp) { struct cpswp_softc *sc; sc = ifp->if_softc; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || sc->swsc->tx.running == 0) { return; } CPSW_TX_LOCK(sc->swsc); cpswp_tx_enqueue(sc); cpsw_tx_dequeue(sc->swsc); CPSW_TX_UNLOCK(sc->swsc); } static void cpsw_intr_tx(void *arg) { struct cpsw_softc *sc; sc = (struct cpsw_softc *)arg; CPSW_TX_LOCK(sc); if (cpsw_read_4(sc, CPSW_CPDMA_TX_CP(0)) == 0xfffffffc) cpsw_write_cp(sc, &sc->tx, 0xfffffffc); cpsw_tx_dequeue(sc); cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 2); CPSW_TX_UNLOCK(sc); } static void cpswp_tx_enqueue(struct cpswp_softc *sc) { bus_dma_segment_t segs[CPSW_TXFRAGS]; struct cpsw_cpdma_bd bd; struct cpsw_slot *first_new_slot, *last, *last_old_slot, *next, *slot; struct mbuf *m0; int error, nsegs, seg, added = 0, padlen; /* Pull pending packets from IF queue and prep them for DMA. */ last = NULL; first_new_slot = NULL; last_old_slot = STAILQ_LAST(&sc->swsc->tx.active, cpsw_slot, next); while ((slot = STAILQ_FIRST(&sc->swsc->tx.avail)) != NULL) { IF_DEQUEUE(&sc->ifp->if_snd, m0); if (m0 == NULL) break; slot->mbuf = m0; padlen = ETHER_MIN_LEN - ETHER_CRC_LEN - m0->m_pkthdr.len; if (padlen < 0) padlen = 0; else if (padlen > 0) m_append(slot->mbuf, padlen, sc->swsc->nullpad); /* Create mapping in DMA memory */ error = bus_dmamap_load_mbuf_sg(sc->swsc->mbuf_dtag, slot->dmamap, slot->mbuf, segs, &nsegs, BUS_DMA_NOWAIT); /* If the packet is too fragmented, try to simplify. */ if (error == EFBIG || (error == 0 && nsegs > sc->swsc->tx.avail_queue_len)) { bus_dmamap_unload(sc->swsc->mbuf_dtag, slot->dmamap); m0 = m_defrag(slot->mbuf, M_NOWAIT); if (m0 == NULL) { device_printf(sc->dev, "Can't defragment packet; dropping\n"); m_freem(slot->mbuf); } else { CPSW_DEBUGF(sc->swsc, ("Requeueing defragmented packet")); IF_PREPEND(&sc->ifp->if_snd, m0); } slot->mbuf = NULL; continue; } if (error != 0) { device_printf(sc->dev, "%s: Can't setup DMA (error=%d), dropping packet\n", __func__, error); bus_dmamap_unload(sc->swsc->mbuf_dtag, slot->dmamap); m_freem(slot->mbuf); slot->mbuf = NULL; break; } bus_dmamap_sync(sc->swsc->mbuf_dtag, slot->dmamap, BUS_DMASYNC_PREWRITE); CPSW_DEBUGF(sc->swsc, ("Queueing TX packet: %d segments + %d pad bytes", nsegs, padlen)); if (first_new_slot == NULL) first_new_slot = slot; /* Link from the previous descriptor. */ if (last != NULL) cpsw_cpdma_write_bd_next(sc->swsc, last, slot); slot->ifp = sc->ifp; /* If there is only one segment, the for() loop * gets skipped and the single buffer gets set up * as both SOP and EOP. */ if (nsegs > 1) { next = STAILQ_NEXT(slot, next); bd.next = cpsw_cpdma_bd_paddr(sc->swsc, next); } else bd.next = 0; /* Start by setting up the first buffer. */ bd.bufptr = segs[0].ds_addr; bd.bufoff = 0; bd.buflen = segs[0].ds_len; bd.pktlen = m_length(slot->mbuf, NULL); bd.flags = CPDMA_BD_SOP | CPDMA_BD_OWNER; if (sc->swsc->dualemac) { bd.flags |= CPDMA_BD_TO_PORT; bd.flags |= ((sc->unit + 1) & CPDMA_BD_PORT_MASK); } for (seg = 1; seg < nsegs; ++seg) { /* Save the previous buffer (which isn't EOP) */ cpsw_cpdma_write_bd(sc->swsc, slot, &bd); STAILQ_REMOVE_HEAD(&sc->swsc->tx.avail, next); STAILQ_INSERT_TAIL(&sc->swsc->tx.active, slot, next); slot = STAILQ_FIRST(&sc->swsc->tx.avail); /* Setup next buffer (which isn't SOP) */ if (nsegs > seg + 1) { next = STAILQ_NEXT(slot, next); bd.next = cpsw_cpdma_bd_paddr(sc->swsc, next); } else bd.next = 0; bd.bufptr = segs[seg].ds_addr; bd.bufoff = 0; bd.buflen = segs[seg].ds_len; bd.pktlen = 0; bd.flags = CPDMA_BD_OWNER; } /* Save the final buffer. */ bd.flags |= CPDMA_BD_EOP; cpsw_cpdma_write_bd(sc->swsc, slot, &bd); STAILQ_REMOVE_HEAD(&sc->swsc->tx.avail, next); STAILQ_INSERT_TAIL(&sc->swsc->tx.active, slot, next); last = slot; added += nsegs; if (nsegs > sc->swsc->tx.longest_chain) sc->swsc->tx.longest_chain = nsegs; BPF_MTAP(sc->ifp, m0); } if (first_new_slot == NULL) return; /* Attach the list of new buffers to the hardware TX queue. */ if (last_old_slot != NULL && (cpsw_cpdma_read_bd_flags(sc->swsc, last_old_slot) & CPDMA_BD_EOQ) == 0) { /* Add buffers to end of current queue. */ cpsw_cpdma_write_bd_next(sc->swsc, last_old_slot, first_new_slot); } else { /* Start a fresh queue. */ cpsw_write_hdp_slot(sc->swsc, &sc->swsc->tx, first_new_slot); } sc->swsc->tx.queue_adds += added; sc->swsc->tx.avail_queue_len -= added; sc->swsc->tx.active_queue_len += added; if (sc->swsc->tx.active_queue_len > sc->swsc->tx.max_active_queue_len) { sc->swsc->tx.max_active_queue_len = sc->swsc->tx.active_queue_len; } CPSW_DEBUGF(sc->swsc, ("Queued %d TX packet(s)", added)); } static int cpsw_tx_dequeue(struct cpsw_softc *sc) { struct cpsw_slot *slot, *last_removed_slot = NULL; struct cpsw_cpdma_bd bd; uint32_t flags, removed = 0; /* Pull completed buffers off the hardware TX queue. */ slot = STAILQ_FIRST(&sc->tx.active); while (slot != NULL) { flags = cpsw_cpdma_read_bd_flags(sc, slot); /* TearDown complete is only marked on the SOP for the packet. */ if ((flags & (CPDMA_BD_SOP | CPDMA_BD_TDOWNCMPLT)) == (CPDMA_BD_SOP | CPDMA_BD_TDOWNCMPLT)) { sc->tx.teardown = 1; } if ((flags & (CPDMA_BD_SOP | CPDMA_BD_OWNER)) == (CPDMA_BD_SOP | CPDMA_BD_OWNER) && sc->tx.teardown == 0) break; /* Hardware is still using this packet. */ bus_dmamap_sync(sc->mbuf_dtag, slot->dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap); m_freem(slot->mbuf); slot->mbuf = NULL; if (slot->ifp) { if (sc->tx.teardown == 0) if_inc_counter(slot->ifp, IFCOUNTER_OPACKETS, 1); else if_inc_counter(slot->ifp, IFCOUNTER_OQDROPS, 1); } /* Dequeue any additional buffers used by this packet. */ while (slot != NULL && slot->mbuf == NULL) { STAILQ_REMOVE_HEAD(&sc->tx.active, next); STAILQ_INSERT_TAIL(&sc->tx.avail, slot, next); ++removed; last_removed_slot = slot; slot = STAILQ_FIRST(&sc->tx.active); } cpsw_write_cp_slot(sc, &sc->tx, last_removed_slot); /* Restart the TX queue if necessary. */ cpsw_cpdma_read_bd(sc, last_removed_slot, &bd); if (slot != NULL && bd.next != 0 && (bd.flags & (CPDMA_BD_EOP | CPDMA_BD_OWNER | CPDMA_BD_EOQ)) == (CPDMA_BD_EOP | CPDMA_BD_EOQ)) { cpsw_write_hdp_slot(sc, &sc->tx, slot); sc->tx.queue_restart++; break; } } if (removed != 0) { sc->tx.queue_removes += removed; sc->tx.active_queue_len -= removed; sc->tx.avail_queue_len += removed; if (sc->tx.avail_queue_len > sc->tx.max_avail_queue_len) sc->tx.max_avail_queue_len = sc->tx.avail_queue_len; CPSW_DEBUGF(sc, ("TX removed %d completed packet(s)", removed)); } if (sc->tx.teardown && STAILQ_EMPTY(&sc->tx.active)) { CPSW_DEBUGF(sc, ("TX teardown is complete")); sc->tx.teardown = 0; sc->tx.running = 0; } return (removed); } /* * * Miscellaneous interrupts. * */ static void cpsw_intr_rx_thresh(void *arg) { struct cpsw_softc *sc; struct ifnet *ifp; struct mbuf *received, *next; sc = (struct cpsw_softc *)arg; CPSW_RX_LOCK(sc); received = cpsw_rx_dequeue(sc); cpsw_rx_enqueue(sc); cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 0); CPSW_RX_UNLOCK(sc); while (received != NULL) { next = received->m_nextpkt; received->m_nextpkt = NULL; ifp = received->m_pkthdr.rcvif; (*ifp->if_input)(ifp, received); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); received = next; } } static void cpsw_intr_misc_host_error(struct cpsw_softc *sc) { uint32_t intstat; uint32_t dmastat; int txerr, rxerr, txchan, rxchan; printf("\n\n"); device_printf(sc->dev, "HOST ERROR: PROGRAMMING ERROR DETECTED BY HARDWARE\n"); printf("\n\n"); intstat = cpsw_read_4(sc, CPSW_CPDMA_DMA_INTSTAT_MASKED); device_printf(sc->dev, "CPSW_CPDMA_DMA_INTSTAT_MASKED=0x%x\n", intstat); dmastat = cpsw_read_4(sc, CPSW_CPDMA_DMASTATUS); device_printf(sc->dev, "CPSW_CPDMA_DMASTATUS=0x%x\n", dmastat); txerr = (dmastat >> 20) & 15; txchan = (dmastat >> 16) & 7; rxerr = (dmastat >> 12) & 15; rxchan = (dmastat >> 8) & 7; switch (txerr) { case 0: break; case 1: printf("SOP error on TX channel %d\n", txchan); break; case 2: printf("Ownership bit not set on SOP buffer on TX channel %d\n", txchan); break; case 3: printf("Zero Next Buffer but not EOP on TX channel %d\n", txchan); break; case 4: printf("Zero Buffer Pointer on TX channel %d\n", txchan); break; case 5: printf("Zero Buffer Length on TX channel %d\n", txchan); break; case 6: printf("Packet length error on TX channel %d\n", txchan); break; default: printf("Unknown error on TX channel %d\n", txchan); break; } if (txerr != 0) { printf("CPSW_CPDMA_TX%d_HDP=0x%x\n", txchan, cpsw_read_4(sc, CPSW_CPDMA_TX_HDP(txchan))); printf("CPSW_CPDMA_TX%d_CP=0x%x\n", txchan, cpsw_read_4(sc, CPSW_CPDMA_TX_CP(txchan))); cpsw_dump_queue(sc, &sc->tx.active); } switch (rxerr) { case 0: break; case 2: printf("Ownership bit not set on RX channel %d\n", rxchan); break; case 4: printf("Zero Buffer Pointer on RX channel %d\n", rxchan); break; case 5: printf("Zero Buffer Length on RX channel %d\n", rxchan); break; case 6: printf("Buffer offset too big on RX channel %d\n", rxchan); break; default: printf("Unknown RX error on RX channel %d\n", rxchan); break; } if (rxerr != 0) { printf("CPSW_CPDMA_RX%d_HDP=0x%x\n", rxchan, cpsw_read_4(sc,CPSW_CPDMA_RX_HDP(rxchan))); printf("CPSW_CPDMA_RX%d_CP=0x%x\n", rxchan, cpsw_read_4(sc, CPSW_CPDMA_RX_CP(rxchan))); cpsw_dump_queue(sc, &sc->rx.active); } printf("\nALE Table\n"); cpsw_ale_dump_table(sc); // XXX do something useful here?? panic("CPSW HOST ERROR INTERRUPT"); // Suppress this interrupt in the future. cpsw_write_4(sc, CPSW_CPDMA_DMA_INTMASK_CLEAR, intstat); printf("XXX HOST ERROR INTERRUPT SUPPRESSED\n"); // The watchdog will probably reset the controller // in a little while. It will probably fail again. } static void cpsw_intr_misc(void *arg) { struct cpsw_softc *sc = arg; uint32_t stat = cpsw_read_4(sc, CPSW_WR_C_MISC_STAT(0)); if (stat & CPSW_WR_C_MISC_EVNT_PEND) CPSW_DEBUGF(sc, ("Time sync event interrupt unimplemented")); if (stat & CPSW_WR_C_MISC_STAT_PEND) cpsw_stats_collect(sc); if (stat & CPSW_WR_C_MISC_HOST_PEND) cpsw_intr_misc_host_error(sc); if (stat & CPSW_WR_C_MISC_MDIOLINK) { cpsw_write_4(sc, MDIOLINKINTMASKED, cpsw_read_4(sc, MDIOLINKINTMASKED)); } if (stat & CPSW_WR_C_MISC_MDIOUSER) { CPSW_DEBUGF(sc, ("MDIO operation completed interrupt unimplemented")); } cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 3); } /* * * Periodic Checks and Watchdog. * */ static void cpswp_tick(void *msc) { struct cpswp_softc *sc = msc; /* Check for media type change */ mii_tick(sc->mii); if (sc->media_status != sc->mii->mii_media.ifm_media) { printf("%s: media type changed (ifm_media=%x)\n", __func__, sc->mii->mii_media.ifm_media); cpswp_ifmedia_upd(sc->ifp); } /* Schedule another timeout one second from now */ callout_reset(&sc->mii_callout, hz, cpswp_tick, sc); } static void cpswp_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct cpswp_softc *sc; struct mii_data *mii; sc = ifp->if_softc; CPSW_DEBUGF(sc->swsc, ("")); CPSW_PORT_LOCK(sc); mii = sc->mii; mii_pollstat(mii); ifmr->ifm_active = mii->mii_media_active; ifmr->ifm_status = mii->mii_media_status; CPSW_PORT_UNLOCK(sc); } static int cpswp_ifmedia_upd(struct ifnet *ifp) { struct cpswp_softc *sc; sc = ifp->if_softc; CPSW_DEBUGF(sc->swsc, ("")); CPSW_PORT_LOCK(sc); mii_mediachg(sc->mii); sc->media_status = sc->mii->mii_media.ifm_media; CPSW_PORT_UNLOCK(sc); return (0); } static void cpsw_tx_watchdog_full_reset(struct cpsw_softc *sc) { struct cpswp_softc *psc; int i; cpsw_debugf_head("CPSW watchdog"); device_printf(sc->dev, "watchdog timeout\n"); printf("CPSW_CPDMA_TX%d_HDP=0x%x\n", 0, cpsw_read_4(sc, CPSW_CPDMA_TX_HDP(0))); printf("CPSW_CPDMA_TX%d_CP=0x%x\n", 0, cpsw_read_4(sc, CPSW_CPDMA_TX_CP(0))); cpsw_dump_queue(sc, &sc->tx.active); for (i = 0; i < CPSW_PORTS; i++) { if (!sc->dualemac && i != sc->active_slave) continue; psc = device_get_softc(sc->port[i].dev); CPSW_PORT_LOCK(psc); cpswp_stop_locked(psc); CPSW_PORT_UNLOCK(psc); } } static void cpsw_tx_watchdog(void *msc) { struct cpsw_softc *sc; sc = msc; CPSW_TX_LOCK(sc); if (sc->tx.active_queue_len == 0 || !sc->tx.running) { sc->watchdog.timer = 0; /* Nothing to do. */ } else if (sc->tx.queue_removes > sc->tx.queue_removes_at_last_tick) { sc->watchdog.timer = 0; /* Stuff done while we weren't looking. */ } else if (cpsw_tx_dequeue(sc) > 0) { sc->watchdog.timer = 0; /* We just did something. */ } else { /* There was something to do but it didn't get done. */ ++sc->watchdog.timer; if (sc->watchdog.timer > 5) { sc->watchdog.timer = 0; ++sc->watchdog.resets; cpsw_tx_watchdog_full_reset(sc); } } sc->tx.queue_removes_at_last_tick = sc->tx.queue_removes; CPSW_TX_UNLOCK(sc); /* Schedule another timeout one second from now */ callout_reset(&sc->watchdog.callout, hz, cpsw_tx_watchdog, sc); } /* * * ALE support routines. * */ static void cpsw_ale_read_entry(struct cpsw_softc *sc, uint16_t idx, uint32_t *ale_entry) { cpsw_write_4(sc, CPSW_ALE_TBLCTL, idx & 1023); ale_entry[0] = cpsw_read_4(sc, CPSW_ALE_TBLW0); ale_entry[1] = cpsw_read_4(sc, CPSW_ALE_TBLW1); ale_entry[2] = cpsw_read_4(sc, CPSW_ALE_TBLW2); } static void cpsw_ale_write_entry(struct cpsw_softc *sc, uint16_t idx, uint32_t *ale_entry) { cpsw_write_4(sc, CPSW_ALE_TBLW0, ale_entry[0]); cpsw_write_4(sc, CPSW_ALE_TBLW1, ale_entry[1]); cpsw_write_4(sc, CPSW_ALE_TBLW2, ale_entry[2]); cpsw_write_4(sc, CPSW_ALE_TBLCTL, 1 << 31 | (idx & 1023)); } static void cpsw_ale_remove_all_mc_entries(struct cpsw_softc *sc) { int i; uint32_t ale_entry[3]; /* First four entries are link address and broadcast. */ for (i = 10; i < CPSW_MAX_ALE_ENTRIES; i++) { cpsw_ale_read_entry(sc, i, ale_entry); if ((ALE_TYPE(ale_entry) == ALE_TYPE_ADDR || ALE_TYPE(ale_entry) == ALE_TYPE_VLAN_ADDR) && ALE_MCAST(ale_entry) == 1) { /* MCast link addr */ ale_entry[0] = ale_entry[1] = ale_entry[2] = 0; cpsw_ale_write_entry(sc, i, ale_entry); } } } static int cpsw_ale_mc_entry_set(struct cpsw_softc *sc, uint8_t portmap, int vlan, uint8_t *mac) { int free_index = -1, matching_index = -1, i; uint32_t ale_entry[3], ale_type; /* Find a matching entry or a free entry. */ for (i = 10; i < CPSW_MAX_ALE_ENTRIES; i++) { cpsw_ale_read_entry(sc, i, ale_entry); /* Entry Type[61:60] is 0 for free entry */ if (free_index < 0 && ALE_TYPE(ale_entry) == 0) free_index = i; if ((((ale_entry[1] >> 8) & 0xFF) == mac[0]) && (((ale_entry[1] >> 0) & 0xFF) == mac[1]) && (((ale_entry[0] >>24) & 0xFF) == mac[2]) && (((ale_entry[0] >>16) & 0xFF) == mac[3]) && (((ale_entry[0] >> 8) & 0xFF) == mac[4]) && (((ale_entry[0] >> 0) & 0xFF) == mac[5])) { matching_index = i; break; } } if (matching_index < 0) { if (free_index < 0) return (ENOMEM); i = free_index; } if (vlan != -1) ale_type = ALE_TYPE_VLAN_ADDR << 28 | vlan << 16; else ale_type = ALE_TYPE_ADDR << 28; /* Set MAC address */ ale_entry[0] = mac[2] << 24 | mac[3] << 16 | mac[4] << 8 | mac[5]; ale_entry[1] = mac[0] << 8 | mac[1]; /* Entry type[61:60] and Mcast fwd state[63:62] is fw(3). */ ale_entry[1] |= ALE_MCAST_FWD | ale_type; /* Set portmask [68:66] */ ale_entry[2] = (portmap & 7) << 2; cpsw_ale_write_entry(sc, i, ale_entry); return 0; } static void cpsw_ale_dump_table(struct cpsw_softc *sc) { int i; uint32_t ale_entry[3]; for (i = 0; i < CPSW_MAX_ALE_ENTRIES; i++) { cpsw_ale_read_entry(sc, i, ale_entry); switch (ALE_TYPE(ale_entry)) { case ALE_TYPE_VLAN: printf("ALE[%4u] %08x %08x %08x ", i, ale_entry[2], ale_entry[1], ale_entry[0]); printf("type: %u ", ALE_TYPE(ale_entry)); printf("vlan: %u ", ALE_VLAN(ale_entry)); printf("untag: %u ", ALE_VLAN_UNTAG(ale_entry)); printf("reg flood: %u ", ALE_VLAN_REGFLOOD(ale_entry)); printf("unreg flood: %u ", ALE_VLAN_UNREGFLOOD(ale_entry)); printf("members: %u ", ALE_VLAN_MEMBERS(ale_entry)); printf("\n"); break; case ALE_TYPE_ADDR: case ALE_TYPE_VLAN_ADDR: printf("ALE[%4u] %08x %08x %08x ", i, ale_entry[2], ale_entry[1], ale_entry[0]); printf("type: %u ", ALE_TYPE(ale_entry)); printf("mac: %02x:%02x:%02x:%02x:%02x:%02x ", (ale_entry[1] >> 8) & 0xFF, (ale_entry[1] >> 0) & 0xFF, (ale_entry[0] >>24) & 0xFF, (ale_entry[0] >>16) & 0xFF, (ale_entry[0] >> 8) & 0xFF, (ale_entry[0] >> 0) & 0xFF); printf(ALE_MCAST(ale_entry) ? "mcast " : "ucast "); if (ALE_TYPE(ale_entry) == ALE_TYPE_VLAN_ADDR) printf("vlan: %u ", ALE_VLAN(ale_entry)); printf("port: %u ", ALE_PORTS(ale_entry)); printf("\n"); break; } } printf("\n"); } static int cpswp_ale_update_addresses(struct cpswp_softc *sc, int purge) { uint8_t *mac; uint32_t ale_entry[3], ale_type, portmask; struct ifmultiaddr *ifma; if (sc->swsc->dualemac) { ale_type = ALE_TYPE_VLAN_ADDR << 28 | sc->vlan << 16; portmask = 1 << (sc->unit + 1) | 1 << 0; } else { ale_type = ALE_TYPE_ADDR << 28; portmask = 7; } /* * Route incoming packets for our MAC address to Port 0 (host). * For simplicity, keep this entry at table index 0 for port 1 and * at index 2 for port 2 in the ALE. */ if_addr_rlock(sc->ifp); mac = LLADDR((struct sockaddr_dl *)sc->ifp->if_addr->ifa_addr); ale_entry[0] = mac[2] << 24 | mac[3] << 16 | mac[4] << 8 | mac[5]; ale_entry[1] = ale_type | mac[0] << 8 | mac[1]; /* addr entry + mac */ ale_entry[2] = 0; /* port = 0 */ cpsw_ale_write_entry(sc->swsc, 0 + 2 * sc->unit, ale_entry); /* Set outgoing MAC Address for slave port. */ cpsw_write_4(sc->swsc, CPSW_PORT_P_SA_HI(sc->unit + 1), mac[3] << 24 | mac[2] << 16 | mac[1] << 8 | mac[0]); cpsw_write_4(sc->swsc, CPSW_PORT_P_SA_LO(sc->unit + 1), mac[5] << 8 | mac[4]); if_addr_runlock(sc->ifp); /* Keep the broadcast address at table entry 1 (or 3). */ ale_entry[0] = 0xffffffff; /* Lower 32 bits of MAC */ /* ALE_MCAST_FWD, Addr type, upper 16 bits of Mac */ ale_entry[1] = ALE_MCAST_FWD | ale_type | 0xffff; ale_entry[2] = portmask << 2; cpsw_ale_write_entry(sc->swsc, 1 + 2 * sc->unit, ale_entry); /* SIOCDELMULTI doesn't specify the particular address being removed, so we have to remove all and rebuild. */ if (purge) cpsw_ale_remove_all_mc_entries(sc->swsc); /* Set other multicast addrs desired. */ if_maddr_rlock(sc->ifp); CK_STAILQ_FOREACH(ifma, &sc->ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; cpsw_ale_mc_entry_set(sc->swsc, portmask, sc->vlan, LLADDR((struct sockaddr_dl *)ifma->ifma_addr)); } if_maddr_runlock(sc->ifp); return (0); } static int cpsw_ale_update_vlan_table(struct cpsw_softc *sc, int vlan, int ports, int untag, int mcregflood, int mcunregflood) { int free_index, i, matching_index; uint32_t ale_entry[3]; free_index = matching_index = -1; /* Find a matching entry or a free entry. */ for (i = 5; i < CPSW_MAX_ALE_ENTRIES; i++) { cpsw_ale_read_entry(sc, i, ale_entry); /* Entry Type[61:60] is 0 for free entry */ if (free_index < 0 && ALE_TYPE(ale_entry) == 0) free_index = i; if (ALE_VLAN(ale_entry) == vlan) { matching_index = i; break; } } if (matching_index < 0) { if (free_index < 0) return (-1); i = free_index; } ale_entry[0] = (untag & 7) << 24 | (mcregflood & 7) << 16 | (mcunregflood & 7) << 8 | (ports & 7); ale_entry[1] = ALE_TYPE_VLAN << 28 | vlan << 16; ale_entry[2] = 0; cpsw_ale_write_entry(sc, i, ale_entry); return (0); } /* * * Statistics and Sysctls. * */ #if 0 static void cpsw_stats_dump(struct cpsw_softc *sc) { int i; uint32_t r; for (i = 0; i < CPSW_SYSCTL_COUNT; ++i) { r = cpsw_read_4(sc, CPSW_STATS_OFFSET + cpsw_stat_sysctls[i].reg); CPSW_DEBUGF(sc, ("%s: %ju + %u = %ju", cpsw_stat_sysctls[i].oid, (intmax_t)sc->shadow_stats[i], r, (intmax_t)sc->shadow_stats[i] + r)); } } #endif static void cpsw_stats_collect(struct cpsw_softc *sc) { int i; uint32_t r; CPSW_DEBUGF(sc, ("Controller shadow statistics updated.")); for (i = 0; i < CPSW_SYSCTL_COUNT; ++i) { r = cpsw_read_4(sc, CPSW_STATS_OFFSET + cpsw_stat_sysctls[i].reg); sc->shadow_stats[i] += r; cpsw_write_4(sc, CPSW_STATS_OFFSET + cpsw_stat_sysctls[i].reg, r); } } static int cpsw_stats_sysctl(SYSCTL_HANDLER_ARGS) { struct cpsw_softc *sc; struct cpsw_stat *stat; uint64_t result; sc = (struct cpsw_softc *)arg1; stat = &cpsw_stat_sysctls[oidp->oid_number]; result = sc->shadow_stats[oidp->oid_number]; result += cpsw_read_4(sc, CPSW_STATS_OFFSET + stat->reg); return (sysctl_handle_64(oidp, &result, 0, req)); } static int cpsw_stat_attached(SYSCTL_HANDLER_ARGS) { struct cpsw_softc *sc; struct bintime t; unsigned result; sc = (struct cpsw_softc *)arg1; getbinuptime(&t); bintime_sub(&t, &sc->attach_uptime); result = t.sec; return (sysctl_handle_int(oidp, &result, 0, req)); } static int cpsw_intr_coalesce(SYSCTL_HANDLER_ARGS) { int error; struct cpsw_softc *sc; uint32_t ctrl, intr_per_ms; sc = (struct cpsw_softc *)arg1; error = sysctl_handle_int(oidp, &sc->coal_us, 0, req); if (error != 0 || req->newptr == NULL) return (error); ctrl = cpsw_read_4(sc, CPSW_WR_INT_CONTROL); ctrl &= ~(CPSW_WR_INT_PACE_EN | CPSW_WR_INT_PRESCALE_MASK); if (sc->coal_us == 0) { /* Disable the interrupt pace hardware. */ cpsw_write_4(sc, CPSW_WR_INT_CONTROL, ctrl); cpsw_write_4(sc, CPSW_WR_C_RX_IMAX(0), 0); cpsw_write_4(sc, CPSW_WR_C_TX_IMAX(0), 0); return (0); } if (sc->coal_us > CPSW_WR_C_IMAX_US_MAX) sc->coal_us = CPSW_WR_C_IMAX_US_MAX; if (sc->coal_us < CPSW_WR_C_IMAX_US_MIN) sc->coal_us = CPSW_WR_C_IMAX_US_MIN; intr_per_ms = 1000 / sc->coal_us; /* Just to make sure... */ if (intr_per_ms > CPSW_WR_C_IMAX_MAX) intr_per_ms = CPSW_WR_C_IMAX_MAX; if (intr_per_ms < CPSW_WR_C_IMAX_MIN) intr_per_ms = CPSW_WR_C_IMAX_MIN; /* Set the prescale to produce 4us pulses from the 125 Mhz clock. */ ctrl |= (125 * 4) & CPSW_WR_INT_PRESCALE_MASK; /* Enable the interrupt pace hardware. */ cpsw_write_4(sc, CPSW_WR_C_RX_IMAX(0), intr_per_ms); cpsw_write_4(sc, CPSW_WR_C_TX_IMAX(0), intr_per_ms); ctrl |= CPSW_WR_INT_C0_RX_PULSE | CPSW_WR_INT_C0_TX_PULSE; cpsw_write_4(sc, CPSW_WR_INT_CONTROL, ctrl); return (0); } static int cpsw_stat_uptime(SYSCTL_HANDLER_ARGS) { struct cpsw_softc *swsc; struct cpswp_softc *sc; struct bintime t; unsigned result; swsc = arg1; sc = device_get_softc(swsc->port[arg2].dev); if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) { getbinuptime(&t); bintime_sub(&t, &sc->init_uptime); result = t.sec; } else result = 0; return (sysctl_handle_int(oidp, &result, 0, req)); } static void cpsw_add_queue_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *node, struct cpsw_queue *queue) { struct sysctl_oid_list *parent; parent = SYSCTL_CHILDREN(node); SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "totalBuffers", CTLFLAG_RD, &queue->queue_slots, 0, "Total buffers currently assigned to this queue"); SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "activeBuffers", CTLFLAG_RD, &queue->active_queue_len, 0, "Buffers currently registered with hardware controller"); SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "maxActiveBuffers", CTLFLAG_RD, &queue->max_active_queue_len, 0, "Max value of activeBuffers since last driver reset"); SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "availBuffers", CTLFLAG_RD, &queue->avail_queue_len, 0, "Buffers allocated to this queue but not currently " "registered with hardware controller"); SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "maxAvailBuffers", CTLFLAG_RD, &queue->max_avail_queue_len, 0, "Max value of availBuffers since last driver reset"); SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "totalEnqueued", CTLFLAG_RD, &queue->queue_adds, 0, "Total buffers added to queue"); SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "totalDequeued", CTLFLAG_RD, &queue->queue_removes, 0, "Total buffers removed from queue"); SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "queueRestart", CTLFLAG_RD, &queue->queue_restart, 0, "Total times the queue has been restarted"); SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "longestChain", CTLFLAG_RD, &queue->longest_chain, 0, "Max buffers used for a single packet"); } static void cpsw_add_watchdog_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *node, struct cpsw_softc *sc) { struct sysctl_oid_list *parent; parent = SYSCTL_CHILDREN(node); SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "resets", CTLFLAG_RD, &sc->watchdog.resets, 0, "Total number of watchdog resets"); } static void cpsw_add_sysctls(struct cpsw_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *stats_node, *queue_node, *node; struct sysctl_oid_list *parent, *stats_parent, *queue_parent; struct sysctl_oid_list *ports_parent, *port_parent; char port[16]; int i; ctx = device_get_sysctl_ctx(sc->dev); parent = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "debug", CTLFLAG_RW, &sc->debug, 0, "Enable switch debug messages"); SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "attachedSecs", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, cpsw_stat_attached, "IU", "Time since driver attach"); SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "intr_coalesce_us", CTLTYPE_UINT | CTLFLAG_RW, sc, 0, cpsw_intr_coalesce, "IU", "minimum time between interrupts"); node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "ports", CTLFLAG_RD, NULL, "CPSW Ports Statistics"); ports_parent = SYSCTL_CHILDREN(node); for (i = 0; i < CPSW_PORTS; i++) { if (!sc->dualemac && i != sc->active_slave) continue; port[0] = '0' + i; port[1] = '\0'; node = SYSCTL_ADD_NODE(ctx, ports_parent, OID_AUTO, port, CTLFLAG_RD, NULL, "CPSW Port Statistics"); port_parent = SYSCTL_CHILDREN(node); SYSCTL_ADD_PROC(ctx, port_parent, OID_AUTO, "uptime", CTLTYPE_UINT | CTLFLAG_RD, sc, i, cpsw_stat_uptime, "IU", "Seconds since driver init"); } stats_node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "stats", CTLFLAG_RD, NULL, "CPSW Statistics"); stats_parent = SYSCTL_CHILDREN(stats_node); for (i = 0; i < CPSW_SYSCTL_COUNT; ++i) { SYSCTL_ADD_PROC(ctx, stats_parent, i, cpsw_stat_sysctls[i].oid, CTLTYPE_U64 | CTLFLAG_RD, sc, 0, cpsw_stats_sysctl, "IU", cpsw_stat_sysctls[i].oid); } queue_node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "queue", CTLFLAG_RD, NULL, "CPSW Queue Statistics"); queue_parent = SYSCTL_CHILDREN(queue_node); node = SYSCTL_ADD_NODE(ctx, queue_parent, OID_AUTO, "tx", CTLFLAG_RD, NULL, "TX Queue Statistics"); cpsw_add_queue_sysctls(ctx, node, &sc->tx); node = SYSCTL_ADD_NODE(ctx, queue_parent, OID_AUTO, "rx", CTLFLAG_RD, NULL, "RX Queue Statistics"); cpsw_add_queue_sysctls(ctx, node, &sc->rx); node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "watchdog", CTLFLAG_RD, NULL, "Watchdog Statistics"); cpsw_add_watchdog_sysctls(ctx, node, sc); } #ifdef CPSW_ETHERSWITCH static etherswitch_info_t etherswitch_info = { .es_nports = CPSW_PORTS + 1, .es_nvlangroups = CPSW_VLANS, .es_name = "TI Common Platform Ethernet Switch (CPSW)", .es_vlan_caps = ETHERSWITCH_VLAN_DOT1Q, }; static etherswitch_info_t * cpsw_getinfo(device_t dev) { return (ðerswitch_info); } static int cpsw_getport(device_t dev, etherswitch_port_t *p) { int err; struct cpsw_softc *sc; struct cpswp_softc *psc; struct ifmediareq *ifmr; uint32_t reg; if (p->es_port < 0 || p->es_port > CPSW_PORTS) return (ENXIO); err = 0; sc = device_get_softc(dev); if (p->es_port == CPSW_CPU_PORT) { p->es_flags |= ETHERSWITCH_PORT_CPU; ifmr = &p->es_ifmr; ifmr->ifm_current = ifmr->ifm_active = IFM_ETHER | IFM_1000_T | IFM_FDX; ifmr->ifm_mask = 0; ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID; ifmr->ifm_count = 0; } else { psc = device_get_softc(sc->port[p->es_port - 1].dev); err = ifmedia_ioctl(psc->ifp, &p->es_ifr, &psc->mii->mii_media, SIOCGIFMEDIA); } reg = cpsw_read_4(sc, CPSW_PORT_P_VLAN(p->es_port)); p->es_pvid = reg & ETHERSWITCH_VID_MASK; reg = cpsw_read_4(sc, CPSW_ALE_PORTCTL(p->es_port)); if (reg & ALE_PORTCTL_DROP_UNTAGGED) p->es_flags |= ETHERSWITCH_PORT_DROPUNTAGGED; if (reg & ALE_PORTCTL_INGRESS) p->es_flags |= ETHERSWITCH_PORT_INGRESS; return (err); } static int cpsw_setport(device_t dev, etherswitch_port_t *p) { struct cpsw_softc *sc; struct cpswp_softc *psc; struct ifmedia *ifm; uint32_t reg; if (p->es_port < 0 || p->es_port > CPSW_PORTS) return (ENXIO); sc = device_get_softc(dev); if (p->es_pvid != 0) { cpsw_write_4(sc, CPSW_PORT_P_VLAN(p->es_port), p->es_pvid & ETHERSWITCH_VID_MASK); } reg = cpsw_read_4(sc, CPSW_ALE_PORTCTL(p->es_port)); if (p->es_flags & ETHERSWITCH_PORT_DROPUNTAGGED) reg |= ALE_PORTCTL_DROP_UNTAGGED; else reg &= ~ALE_PORTCTL_DROP_UNTAGGED; if (p->es_flags & ETHERSWITCH_PORT_INGRESS) reg |= ALE_PORTCTL_INGRESS; else reg &= ~ALE_PORTCTL_INGRESS; cpsw_write_4(sc, CPSW_ALE_PORTCTL(p->es_port), reg); /* CPU port does not allow media settings. */ if (p->es_port == CPSW_CPU_PORT) return (0); psc = device_get_softc(sc->port[p->es_port - 1].dev); ifm = &psc->mii->mii_media; return (ifmedia_ioctl(psc->ifp, &p->es_ifr, ifm, SIOCSIFMEDIA)); } static int cpsw_getconf(device_t dev, etherswitch_conf_t *conf) { /* Return the VLAN mode. */ conf->cmd = ETHERSWITCH_CONF_VLAN_MODE; conf->vlan_mode = ETHERSWITCH_VLAN_DOT1Q; return (0); } static int cpsw_getvgroup(device_t dev, etherswitch_vlangroup_t *vg) { int i, vid; uint32_t ale_entry[3]; struct cpsw_softc *sc; sc = device_get_softc(dev); if (vg->es_vlangroup >= CPSW_VLANS) return (EINVAL); vg->es_vid = 0; vid = cpsw_vgroups[vg->es_vlangroup].vid; if (vid == -1) return (0); for (i = 0; i < CPSW_MAX_ALE_ENTRIES; i++) { cpsw_ale_read_entry(sc, i, ale_entry); if (ALE_TYPE(ale_entry) != ALE_TYPE_VLAN) continue; if (vid != ALE_VLAN(ale_entry)) continue; vg->es_fid = 0; vg->es_vid = ALE_VLAN(ale_entry) | ETHERSWITCH_VID_VALID; vg->es_member_ports = ALE_VLAN_MEMBERS(ale_entry); vg->es_untagged_ports = ALE_VLAN_UNTAG(ale_entry); } return (0); } static void cpsw_remove_vlan(struct cpsw_softc *sc, int vlan) { int i; uint32_t ale_entry[3]; for (i = 0; i < CPSW_MAX_ALE_ENTRIES; i++) { cpsw_ale_read_entry(sc, i, ale_entry); if (ALE_TYPE(ale_entry) != ALE_TYPE_VLAN) continue; if (vlan != ALE_VLAN(ale_entry)) continue; ale_entry[0] = ale_entry[1] = ale_entry[2] = 0; cpsw_ale_write_entry(sc, i, ale_entry); break; } } static int cpsw_setvgroup(device_t dev, etherswitch_vlangroup_t *vg) { int i; struct cpsw_softc *sc; sc = device_get_softc(dev); for (i = 0; i < CPSW_VLANS; i++) { /* Is this Vlan ID in use by another vlangroup ? */ if (vg->es_vlangroup != i && cpsw_vgroups[i].vid == vg->es_vid) return (EINVAL); } if (vg->es_vid == 0) { if (cpsw_vgroups[vg->es_vlangroup].vid == -1) return (0); cpsw_remove_vlan(sc, cpsw_vgroups[vg->es_vlangroup].vid); cpsw_vgroups[vg->es_vlangroup].vid = -1; vg->es_untagged_ports = 0; vg->es_member_ports = 0; vg->es_vid = 0; return (0); } vg->es_vid &= ETHERSWITCH_VID_MASK; vg->es_member_ports &= CPSW_PORTS_MASK; vg->es_untagged_ports &= CPSW_PORTS_MASK; if (cpsw_vgroups[vg->es_vlangroup].vid != -1 && cpsw_vgroups[vg->es_vlangroup].vid != vg->es_vid) return (EINVAL); cpsw_vgroups[vg->es_vlangroup].vid = vg->es_vid; cpsw_ale_update_vlan_table(sc, vg->es_vid, vg->es_member_ports, vg->es_untagged_ports, vg->es_member_ports, 0); return (0); } static int cpsw_readreg(device_t dev, int addr) { /* Not supported. */ return (0); } static int cpsw_writereg(device_t dev, int addr, int value) { /* Not supported. */ return (0); } static int cpsw_readphy(device_t dev, int phy, int reg) { /* Not supported. */ return (0); } static int cpsw_writephy(device_t dev, int phy, int reg, int data) { /* Not supported. */ return (0); } #endif Index: projects/fuse2/sys/arm/ti/ti_hwmods.c =================================================================== --- projects/fuse2/sys/arm/ti/ti_hwmods.c (revision 350434) +++ projects/fuse2/sys/arm/ti/ti_hwmods.c (revision 350435) @@ -1,204 +1,214 @@ /*- * Copyright (c) 2015 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include struct hwmod { const char *name; int clock_id; }; struct hwmod ti_hwmods[] = { {"i2c1", I2C1_CLK}, {"i2c2", I2C2_CLK}, {"i2c3", I2C3_CLK}, {"i2c4", I2C4_CLK}, {"i2c5", I2C5_CLK}, {"gpio1", GPIO1_CLK}, {"gpio2", GPIO2_CLK}, {"gpio3", GPIO3_CLK}, {"gpio4", GPIO4_CLK}, {"gpio5", GPIO5_CLK}, {"gpio6", GPIO6_CLK}, {"gpio7", GPIO7_CLK}, {"mmc1", MMC1_CLK}, {"mmc2", MMC2_CLK}, {"mmc3", MMC3_CLK}, {"mmc4", MMC4_CLK}, {"mmc5", MMC5_CLK}, {"mmc6", MMC6_CLK}, {"epwmss0", PWMSS0_CLK}, {"epwmss1", PWMSS1_CLK}, {"epwmss2", PWMSS2_CLK}, {"spi0", SPI0_CLK}, {"spi1", SPI1_CLK}, {"timer1", TIMER1_CLK}, {"timer2", TIMER2_CLK}, {"timer3", TIMER3_CLK}, {"timer4", TIMER4_CLK}, {"timer5", TIMER5_CLK}, {"timer6", TIMER6_CLK}, {"timer7", TIMER7_CLK}, {"uart1", UART1_CLK}, {"uart2", UART2_CLK}, {"uart3", UART3_CLK}, {"uart4", UART4_CLK}, {"uart5", UART5_CLK}, {"uart6", UART6_CLK}, {"uart7", UART7_CLK}, {NULL, 0} }; +static inline int +ti_get_hwmods_prop(phandle_t node, void **name) +{ + int len; + + if ((len = OF_getprop_alloc(node, "ti,hwmods", name)) > 0) + return (len); + return (OF_getprop_alloc(OF_parent(node), "ti,hwmods", name)); +} + clk_ident_t ti_hwmods_get_clock(device_t dev) { phandle_t node; int len, l; char *name; char *buf; int clk; struct hwmod *hw; if ((node = ofw_bus_get_node(dev)) == 0) return (INVALID_CLK_IDENT); - if ((len = OF_getprop_alloc(OF_parent(node), "ti,hwmods", (void**)&name)) <= 0) + if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0) return (INVALID_CLK_IDENT); buf = name; clk = INVALID_CLK_IDENT; while ((len > 0) && (clk == INVALID_CLK_IDENT)) { for (hw = ti_hwmods; hw->name != NULL; ++hw) { if (strcmp(hw->name, name) == 0) { clk = hw->clock_id; break; } } /* Slide to the next sub-string. */ l = strlen(name) + 1; name += l; len -= l; } if (len > 0) device_printf(dev, "WARNING: more than one ti,hwmod \n"); OF_prop_free(buf); return (clk); } int ti_hwmods_contains(device_t dev, const char *hwmod) { phandle_t node; int len, l; char *name; char *buf; int result; if ((node = ofw_bus_get_node(dev)) == 0) return (0); - if ((len = OF_getprop_alloc(OF_parent(node), "ti,hwmods", (void**)&name)) <= 0) + if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0) return (0); buf = name; result = 0; while (len > 0) { if (strcmp(name, hwmod) == 0) { result = 1; break; } /* Slide to the next sub-string. */ l = strlen(name) + 1; name += l; len -= l; } OF_prop_free(buf); return (result); } int ti_hwmods_get_unit(device_t dev, const char *hwmod) { phandle_t node; int l, len, hwmodlen, result; char *name; char *buf; if ((node = ofw_bus_get_node(dev)) == 0) return (0); - if ((len = OF_getprop_alloc(OF_parent(node), "ti,hwmods", (void**)&name)) <= 0) + if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0) return (0); buf = name; hwmodlen = strlen(hwmod); result = 0; while (len > 0) { if (strncmp(name, hwmod, hwmodlen) == 0) { result = (int)strtoul(name + hwmodlen, NULL, 10); break; } /* Slide to the next sub-string. */ l = strlen(name) + 1; name += l; len -= l; } OF_prop_free(buf); return (result); } Index: projects/fuse2/sys/arm64/arm64/pmap.c =================================================================== --- projects/fuse2/sys/arm64/arm64/pmap.c (revision 350434) +++ projects/fuse2/sys/arm64/arm64/pmap.c (revision 350435) @@ -1,5930 +1,5930 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * The presence of this flag indicates that the mapping is writeable. * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is * dirty. This flag may only be set on managed mappings. */ static pt_entry_t ATTR_SW_DBM; struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ #define PMAP_PREINIT_MAPPING_COUNT 32 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ static int vm_initialized = 0; /* No need to use pre-init maps when set */ /* * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. * Always map entire L2 block for simplicity. * VA of L2 block = preinit_map_va + i * L2_SIZE */ static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t size; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; /* * Data for the pv entry allocation mechanism. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) static vm_paddr_t physmap[PHYSMAP_SIZE]; static u_int physmap_idx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) #define pmap_load(table) (*table) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set_bits(table, bits) atomic_set_64(table, bits) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) return (false); *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); /* * Checks if the PTE is dirty. */ static inline int pmap_pte_dirty(pt_entry_t pte) { KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); KASSERT((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != 0, ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); return ((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static vm_offset_t pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_offset_t freemempos) { pt_entry_t *l2; vm_offset_t va; vm_paddr_t l2_pa, pa; u_int l1_slot, l2_slot, prev_l1_slot; int i; dmap_phys_base = min_pa & ~L1_OFFSET; dmap_phys_max = 0; dmap_max_addr = 0; l2 = NULL; prev_l1_slot = -1; #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); for (i = 0; i < (physmap_idx * 2); i += 2) { pa = physmap[i] & ~L2_OFFSET; va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; /* Create L2 mappings at the start of the region */ if ((pa & L1_OFFSET) != 0) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { /* * We are on a boundary, stop to * create a level 1 block */ if ((pa & L1_OFFSET) == 0) break; l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), ("...")); } for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && (physmap[i + 1] - pa) >= L1_SIZE; pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Create L2 mappings at the end of the region */ if (pa < physmap[i + 1]) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } } if (pa > dmap_phys_max) { dmap_phys_max = pa; dmap_max_addr = va; } } cpu_tlb_flushID(); return (freemempos); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; uint64_t kern_delta; int i; #ifdef notyet /* Determine whether the hardware implements DBM management. */ uint64_t reg = READ_SPECIALREG(ID_AA64MMFR1_EL1); ATTR_SW_DBM = ID_AA64MMFR1_HAFDBS(reg) == ID_AA64MMFR1_HAFDBS_AF_DBS ? ATTR_DBM : _ATTR_SW_DBM; #else ATTR_SW_DBM = _ATTR_SW_DBM; #endif kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; physmap_idx = arm_physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; } freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create a direct map region early so we can use it for pa -> va */ freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); va = KERNBASE; start_pa = pa = KERNBASE - kern_delta; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; /* Reserve some VA space for early BIOS/ACPI mapping */ preinit_map_va = roundup2(freemempos, L2_SIZE); virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; virtual_avail = roundup2(virtual_avail, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L2_SIZE; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); vm_initialized = 1; } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } static __inline void pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { sched_pin(); pmap_invalidate_range_nopin(pmap, sva, eva); sched_unpin(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. */ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_paddr_t pa; vm_page_t m; int lvl; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || ((prot & VM_PROT_WRITE) == 0)) { switch(lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } if (vm_page_pa_tryrelock(pmap, (tpte & ~ATTR_MASK) | off, &pa)) goto retry; m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); vm_page_wire(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pa = 0; pte = pmap_pte(kernel_pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_kextract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_kextract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_kextract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte, attr; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE; if (mode == DEVICE_MEMORY) attr |= ATTR_XN; va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, DEVICE_MEMORY); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; if (m->md.pv_memattr == DEVICE_MEMORY) pa |= ATTR_XN; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0 = kernel_pmap->pm_l0; pmap->pm_root.rt_root = 0; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l0phys; vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l0phys = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->wire_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->wire_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); l2pg->wire_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->wire_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0)); vm_page_unwire_noq(m); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l1, paddr | L1_TABLE); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & ATTR_AF) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_load_store(l2, paddr | L2_TABLE); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed, lvl; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pv_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va, &lvl); if (lvl != 2) continue; pte = pmap_l2_to_l3(pde, va); tpte = pmap_load(pte); if ((tpte & ATTR_SW_WIRED) != 0) continue; tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, pmap_load(pde), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_demote_l2: va is not 2mpage aligned")); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = l2e & ~ATTR_MASK; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | L2_TABLE; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. The caller must have already invalidated the * mapping (i.e., the "break" in break-before-make). */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); old_l2 = pmap_load_clear(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); if (old_l2 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if (old_l2 & ATTR_SW_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); va < eva; va += PAGE_SIZE, m++) { if (pmap_pte_dirty(old_l2)) vm_page_dirty(m); if (old_l2 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_l2: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the specified range of addresses from the L3 page table that is * identified by the given L2 entry. */ static void pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, vm_offset_t eva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; struct rwlock *new_lock; pt_entry_t *l3, old_l3; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), ("pmap_remove_l3_range: range crosses an L3 page table boundary")); va = eva; for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { if (!pmap_l3_valid(pmap_load(l3))) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } old_l3 = pmap_load_clear(l3); if ((old_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; pmap_resident_count_dec(pmap, 1); if ((old_l3 & ATTR_SW_MANAGED) != 0) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if ((old_l3 & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); if (new_lock != *lockp) { if (*lockp != NULL) { /* * Pending TLB invalidations must be * performed before the PV list lock is * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } rw_wunlock(*lockp); } *lockp = new_lock; rw_wlock(*lockp); } pmap_pvh_free(&m->md, pmap, sva); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (va == eva) va = sva; if (pmap_unuse_pt(pmap, sva, l2e, free)) { sva += L3_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, &lock); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, pv->pv_va); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_pte_dirty(tpte)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_protect_l2: sva is not 2mpage aligned")); old_l2 = pmap_load(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); /* * Return if the L2 entry already has the desired access restrictions * in place. */ retry: if ((old_l2 & mask) == nbits) return; /* * When a dirty read/write superpage mapping is write protected, * update the dirty field of each of the superpage's constituent 4KB * pages. */ if ((old_l2 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) goto retry; /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, mask, nbits; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } mask = nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_AP(ATTR_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { mask |= ATTR_XN; nbits |= ATTR_XN; } if (mask == 0) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); retry: /* * Go to the next L3 entry if the current one is * invalid or already has the desired access * restrictions in place. (The latter case occurs * frequently. For example, in a "buildworld" * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) */ if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } /* * When a dirty read/write mapping is write protected, * update the page's dirty field. */ if ((l3 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) goto retry; if (va == va_next) va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); critical_enter(); /* Clear the old mapping */ pmap_clear(pte); pmap_invalidate_range_nopin(pmap, va, va + size); /* Create the new mapping */ pmap_store(pte, newpte); dsb(ishst); critical_exit(); intr_restore(intr); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); setl2: if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l2, &newl2, newl2 & ~ATTR_SW_DBM)) goto setl2; newl2 &= ~ATTR_SW_DBM; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); setl3: if ((oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & ~ATTR_SW_DBM)) goto setl3; oldl3 &= ~ATTR_SW_DBM; } if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; int lvl, rv; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l3 |= ATTR_XN; if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if (va < VM_MAXUSER_ADDRESS) new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; if ((prot & VM_PROT_WRITE) != 0) { new_l3 |= ATTR_SW_DBM; if ((flags & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); } } CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } /* We need to allocate an L3 table. */ } if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; /* * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order * to handle the possibility that a superpage mapping for "va" * was created while we slept. */ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: missing L3 table for kernel va %#lx", va); havel3: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; pv = NULL; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & ATTR_SW_MANAGED) != 0 && (new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. */ orig_l3 = pmap_load_clear(l3); KASSERT((orig_l3 & ~ATTR_MASK) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if (pmap_pte_dirty(orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync icache if exec permission and attribute VM_MEMATTR_WRITE_BACK * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, then other can * access this page before caches are properly synced. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && (opa != pa || (orig_l3 & ATTR_XN))) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); /* * Update the L3 entry */ if (pmap_l3_valid(orig_l3)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ /* XXXMJ need to reload orig_l3 for hardware DBM. */ pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if ((orig_l3 & ATTR_SW_MANAGED) != 0 && pmap_pte_dirty(orig_l3)) vm_page_dirty(m); } else { /* * orig_l3 == new_l3 * This can happens if multiple threads simultaneously * access not yet mapped page. This bad for performance * since this can cause full demotion-NOP-promotion * cycle. * Another possible reasons are: * - VM and pmap memory layout are diverged * - tlb flush is missing somewhere and CPU doesn't see * actual mapping. */ CTR4(KTR_PMAP, "%s: already mapped page - " "pmap %p va 0x%#lx pte 0x%lx", __func__, pmap, va, new_l3); } } else { /* New mapping */ pmap_store(l3, new_l3); dsb(ishst); } #if VM_NRESERVLEVEL > 0 if (pmap != pmap_kernel() && (mpte == NULL || mpte->wire_count == NL3PG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; PMAP_LOCK_ASSERT(pmap, MA_OWNED); new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L2_BLOCK); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; } if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l2 |= ATTR_XN; if (va < VM_MAXUSER_ADDRESS) new_l2 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, old_l2; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((old_l2 = pmap_load(l2)) != 0) { KASSERT(l2pg->wire_count > 1, ("pmap_enter_l2: l2pg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3_range() * will leave the kernel page table page zero filled. * Nonetheless, the TLB could have an intermediate * entry for the kernel page table page. */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); pmap_clear(l2); pmap_invalidate_page(pmap, va); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & ATTR_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, the TLB could * nonetheless have intermediate entries that * refer to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation (See * _pmap_unwire_l3().) */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & ATTR_SW_DBM) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); dsb(ishst); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pd_entry_t *pde; pt_entry_t *l2, *l3, l3_val; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } /* * Abort if a mapping already exists. */ if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) l3_val |= ATTR_XN; else if (va < VM_MAXUSER_ADDRESS) l3_val |= ATTR_PXN; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) { l3_val |= ATTR_SW_MANAGED; l3_val &= ~ATTR_AF; } /* Sync icache before the mapping is stored to PTE */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); pmap_store(l3, l3_val); dsb(ishst); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_clear_bits(l2, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) panic("pmap_unwire: demotion failed"); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * ATTR_SW_WIRED must be cleared atomically. Although * the pmap lock synchronizes access to ATTR_SW_WIRED, * the System MMU may write to the entry concurrently. */ pmap_clear_bits(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. * * Because the executable mappings created by this routine are copied, * it should not have to flush the instruction cache. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pd_entry_t *l0, *l1, *l2, srcptepaddr; pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_l2pg, dstmpte, srcmpte; if (dst_addr != src_addr) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { l0 = pmap_l0(src_pmap, addr); if (pmap_load(l0) == 0) { va_next = (addr + L0_SIZE) & ~L0_OFFSET; if (va_next < addr) va_next = end_addr; continue; } l1 = pmap_l0_to_l1(l0, addr); if (pmap_load(l1) == 0) { va_next = (addr + L1_SIZE) & ~L1_OFFSET; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L2_SIZE) & ~L2_OFFSET; if (va_next < addr) va_next = end_addr; l2 = pmap_l1_to_l2(l1, addr); srcptepaddr = pmap_load(l2); if (srcptepaddr == 0) continue; if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { if ((addr & L2_OFFSET) != 0 || addr + L2_SIZE > end_addr) continue; dst_l2pg = pmap_alloc_l2(dst_pmap, addr, NULL); if (dst_l2pg == NULL) break; l2 = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_l2pg)); l2 = &l2[pmap_l2_index(addr)]; if (pmap_load(l2) == 0 && ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((srcptepaddr & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(l2, (srcptepaddr & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else dst_l2pg->wire_count--; continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_l3_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = pmap_load(src_pte); /* * We only virtual copy managed pages. */ if ((ptetemp & ATTR_SW_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_l3_index(addr)]; if (pmap_load(dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((ptetemp & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_l3(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * the TLB could nonetheless have * intermediate entries that refer * to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: /* * XXX This barrier may not be needed because the destination pmap is * not active. */ dsb(ishst); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); tpte = pmap_load(pte); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (pmap_pte_dirty(tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); oldpte = pmap_load(pte); retry: if ((oldpte & ATTR_SW_DBM) != 0) { if (!atomic_fcmpset_long(pte, &oldpte, (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM)) goto retry; if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; vm_paddr_t pa; int cleared, lvl, md_gen, not_cleared, pvh_gen; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) { /* * Although "tpte" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual superpage number, and the pmap * address to select one 4KB page out of the 512 on * which testing the reference bit will result in * clearing that reference bit. This function is * designed to avoid the selection of the same 4KB page * for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { if ((tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; vm_offset_t va, va_next; vm_page_t m; pd_entry_t *l0, *l1, *l2, oldl2; pt_entry_t *l3, oldl3; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); oldl2 = pmap_load(l2); if (oldl2 == 0) continue; if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { if ((oldl2 & ATTR_SW_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The 2MB page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldl2 & ATTR_SW_WIRED) == 0) { l3 = pmap_l2_to_l3(l2, sva); KASSERT(pmap_load(l3) != 0, ("pmap_advise: invalid PTE")); pmap_remove_l3(pmap, l3, sva, pmap_load(l2), NULL, &lock); } if (lock != NULL) rw_wunlock(lock); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_advise: invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { oldl3 = pmap_load(l3); if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != (ATTR_SW_MANAGED | L3_PAGE)) goto maybe_invlrng; else if (pmap_pte_dirty(oldl3)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); vm_page_dirty(m); } while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_AF) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); } else if ((oldl3 & ATTR_AF) != 0) pmap_clear_bits(l3, ATTR_AF); else goto maybe_invlrng; if (va == va_next) va = sva; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3, oldl3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM * set. If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ if ((oldl2 & ATTR_SW_DBM) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & ATTR_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); l3 = pmap_l2_to_l3(l2, va); oldl3 = pmap_load(l3); while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_SW_DBM) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); if (pmap_l3_valid(oldl3) && (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { pmap_set_bits(l3, ATTR_AP(ATTR_AP_RO)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, free_l2_count, start_idx; if (!vm_initialized) { /* * No L3 ptables so map entire L2 blocks where start VA is: * preinit_map_va + start_idx * L2_SIZE * There may be duplicate mappings (multiple VA -> same PA) but * ARM64 dcache is always PIPT so that's acceptable. */ if (size == 0) return (NULL); /* Calculate how many L2 blocks are needed for the mapping */ l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT; offset = pa & L2_OFFSET; if (preinit_map_va == 0) return (NULL); /* Map 2MiB L2 blocks from reserved VA space */ free_l2_count = 0; start_idx = -1; /* Find enough free contiguous VA space */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (free_l2_count > 0 && ppim->pa != 0) { /* Not enough space here */ free_l2_count = 0; start_idx = -1; continue; } if (ppim->pa == 0) { /* Free L2 block */ if (start_idx == -1) start_idx = i; free_l2_count++; if (free_l2_count == l2_blocks) break; } } if (free_l2_count != l2_blocks) panic("%s: too many preinit mappings", __func__); va = preinit_map_va + (start_idx * L2_SIZE); for (i = start_idx; i < start_idx + l2_blocks; i++) { /* Mark entries as allocated */ ppim = pmap_preinit_mapping + i; ppim->pa = pa; ppim->va = va + offset; ppim->size = size; } /* Map L2 blocks */ pa = rounddown2(pa, L2_SIZE); for (i = 0; i < l2_blocks; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_mapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl)); /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, pa | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; } pmap_invalidate_all(kernel_pmap); va = preinit_map_va + (start_idx * L2_SIZE); } else { /* kva_alloc may be used to map the pages */ offset = pa & PAGE_MASK; size = round_page(offset + size); va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); /* L3 table is linked */ va = trunc_page(va); pa = trunc_page(pa); pmap_kenter(va, size, pa, CACHED_MEMORY); } return ((void *)(va + offset)); } void pmap_unmapbios(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset, tmpsize, va_trunc; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, block; bool preinit_map; l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); /* Remove preinit mapping */ preinit_map = false; block = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va) { KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch")); ppim->va = 0; ppim->pa = 0; ppim->size = 0; preinit_map = true; offset = block * L2_SIZE; va_trunc = rounddown2(va, L2_SIZE) + offset; /* Remove L2_BLOCK */ pde = pmap_pde(kernel_pmap, va_trunc, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc)); l2 = pmap_l1_to_l2(pde, va_trunc); pmap_clear(l2); if (block == (l2_blocks - 1)) break; block++; } } if (preinit_map) { pmap_invalidate_all(kernel_pmap); return; } /* Unmap the pages reserved with kva_alloc. */ if (vm_initialized) { offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); /* Unmap and invalidate the pages */ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kremove(va + tmpsize); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, tmpva, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_IDX_MASK; l3 |= ATTR_IDX(mode); if (mode == DEVICE_MEMORY) l3 |= ATTR_XN; pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. */ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } static void pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) { pt_entry_t *l3; for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { *l3 = newl3; newl3 += L3_SIZE; } } static void pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct rwlock **lockp) { struct spglist free; SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); va &= ~L2_OFFSET; tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldl2 & ATTR_AF) == 0) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", va, pmap); goto fail; } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), ("pmap_demote_l2: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (ml3 == NULL) { pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) { ml3->wire_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; KASSERT((oldl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM), ("pmap_demote_l2: L2 entry is writeable but not dirty")); /* * If the page table page is not leftover from an earlier promotion, * or the mapping attributes have changed, (re)initialize the L3 table. */ if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) pmap_fill_l3(l3, newl3); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Pass PAGE_SIZE so that a single TLB invalidation is performed on * the 2MB page mapping. */ pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; int lvl, val; bool managed; PMAP_LOCK(pmap); retry: val = 0; pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL) { tpte = pmap_load(pte); switch (lvl) { case 3: mask = L3_OFFSET; break; case 2: mask = L2_OFFSET; break; case 1: mask = L1_OFFSET; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; val = MINCORE_INCORE; if (lvl != 3) val |= MINCORE_SUPER; if ((managed && pmap_pte_dirty(tpte)) || (!managed && (tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; pa = (tpte & ~ATTR_MASK) | (addr & mask); } else managed = false; if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0); __asm __volatile( "msr ttbr0_el1, %0 \n" "isb \n" : : "r"(td->td_proc->p_md.md_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } struct pcb * pmap_switch(struct thread *old, struct thread *new) { pcpu_bp_harden bp_harden; struct pcb *pcb; /* Store the new curthread */ PCPU_SET(curthread, new); /* And the new pcb */ pcb = new->td_pcb; PCPU_SET(curpcb, pcb); /* * TODO: We may need to flush the cache here if switching * to a user process. */ if (old == NULL || old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) { __asm __volatile( /* Switch to the new pmap */ "msr ttbr0_el1, %0 \n" "isb \n" /* Invalidate the TLB */ "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n" : : "r"(new->td_proc->p_md.md_l0addr)); /* * Stop userspace from training the branch predictor against * other processes. This will call into a CPU specific * function that clears the branch predictor state. */ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } return (pcb); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pt_entry_t *pte; register_t intr; uint64_t ec, par; int lvl, rv; rv = KERN_FAILURE; ec = ESR_ELx_EXCEPTION(esr); switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (rv); } /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); pte = pmap_pte(pmap, far, &lvl); if (pte != NULL) { pmap_set_bits(pte, ATTR_AF); rv = KERN_SUCCESS; /* * XXXMJ as an optimization we could mark the entry * dirty if this is a write fault. */ } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || (esr & ISS_DATA_WnR) == 0) return (rv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, far, &lvl); if (pte != NULL && (pmap_load(pte) & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { pmap_clear_bits(pte, ATTR_AP_RW_BIT); - pmap_invalidate_page(pmap, trunc_page(far)); + pmap_invalidate_page(pmap, far); rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: PMAP_LOCK(pmap); /* Ask the MMU to check the address */ intr = intr_disable(); if (pmap == kernel_pmap) par = arm64_address_translate_s1e1r(far); else par = arm64_address_translate_s1e0r(far); intr_restore(intr); PMAP_UNLOCK(pmap); /* * If the translation was successful the address was invalid * due to a break-before-make sequence. We can unlock and * return success to the trap handler. */ if (PAR_SUCCESS(par)) rv = KERN_SUCCESS; break; } return (rv); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); } Index: projects/fuse2/sys/arm64/conf/GENERIC =================================================================== --- projects/fuse2/sys/arm64/conf/GENERIC (revision 350434) +++ projects/fuse2/sys/arm64/conf/GENERIC (revision 350435) @@ -1,327 +1,328 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/arm64 # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # https://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu ARM64 ident GENERIC makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols makeoptions WITH_CTF=1 # Run ctfconvert(1) for DTrace support options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options VIMAGE # Subsystem virtualization, e.g. VNET options INET # InterNETworking options INET6 # IPv6 communications protocols options IPSEC_SUPPORT # Allow kldload of ipsec and tcpmd5 options TCP_HHOOK # hhook(9) framework for TCP options TCP_OFFLOAD # TCP offload options TCP_RFC7413 # TCP Fast Open options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support options UFS_ACL # Support for access control lists options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options QUOTA # Enable disk quotas for UFS options MD_ROOT # MD is a potential root device options NFSCL # Network Filesystem Client options NFSD # Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires NFSCL options MSDOSFS # MSDOS Filesystem options CD9660 # ISO 9660 Filesystem options PROCFS # Process filesystem (requires PSEUDOFS) options PSEUDOFS # Pseudo-filesystem framework options GEOM_RAID # Soft RAID functionality. options GEOM_LABEL # Provides labelization options COMPAT_FREEBSD32 # Compatible with FreeBSD/arm options COMPAT_FREEBSD11 # Compatible with FreeBSD11 options COMPAT_FREEBSD12 # Compatible with FreeBSD12 options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI options KTRACE # ktrace(1) support options STACK # stack(9) support options SYSVSHM # SYSV-style shared memory options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options MAC # TrustedBSD MAC Framework options KDTRACE_FRAME # Ensure frames are compiled in options KDTRACE_HOOKS # Kernel DTrace hooks options VFP # Floating-point support options RACCT # Resource accounting framework options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default options RCTL # Resource limits options SMP options INTRNG # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # For full debugger support use (turn off in stable branch): options DDB # Support DDB. #options GDB # Support remote GDB. options DEADLKRES # Enable the deadlock resolver options INVARIANTS # Enable calls of extra sanity checking options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS options WITNESS # Enable checks to detect deadlocks and cycles options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones options ALT_BREAK_TO_DEBUGGER # Enter debugger on keyboard escape sequence options USB_DEBUG # enable debug msgs options VERBOSE_SYSINIT=0 # Support debug.verbose_sysinit, off by default # Kernel Sanitizers #options COVERAGE # Generic kernel coverage. Used by KCOV #options KCOV # Kernel Coverage Sanitizer # Warning: KUBSAN can result in a kernel too large for loader to load #options KUBSAN # Kernel Undefined Behavior Sanitizer # Kernel dump features. options EKCD # Support for encrypted kernel dumps options GZIO # gzip-compressed kernel and user dumps options ZSTDIO # zstd-compressed kernel and user dumps options NETDUMP # netdump(4) client support # SoC support options SOC_ALLWINNER_A64 options SOC_ALLWINNER_H5 options SOC_CAVM_THUNDERX options SOC_HISI_HI6220 options SOC_BRCM_BCM2837 options SOC_MARVELL_8K options SOC_ROCKCHIP_RK3328 options SOC_ROCKCHIP_RK3399 options SOC_XILINX_ZYNQ # Timer drivers device a10_timer # Annapurna Alpine drivers device al_ccu # Alpine Cache Coherency Unit device al_nb_service # Alpine North Bridge Service device al_iofic # I/O Fabric Interrupt Controller device al_serdes # Serializer/Deserializer device al_udma # Universal DMA # Qualcomm Snapdragon drivers device qcom_gcc # Global Clock Controller # VirtIO support device virtio device virtio_pci device virtio_mmio device virtio_blk device vtnet # CPU frequency control device cpufreq # Bus drivers device pci device al_pci # Annapurna Alpine PCI-E options PCI_HP # PCI-Express native HotPlug options PCI_IOV # PCI SR-IOV support # PCI/PCI-X/PCIe Ethernet NICs that use iflib infrastructure device iflib device em # Intel PRO/1000 Gigabit Ethernet Family device ix # Intel 10Gb Ethernet Family # Ethernet NICs device mdio device mii device miibus # MII bus support device awg # Allwinner EMAC Gigabit Ethernet device axgbe # AMD Opteron A1100 integrated NIC device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet device neta # Marvell Armada 370/38x/XP/3700 NIC device smc # SMSC LAN91C111 device vnic # Cavium ThunderX NIC device al_eth # Annapurna Alpine Ethernet NIC device dwc_rk # Rockchip Designware +device dwc_socfpga # Altera SOCFPGA Ethernet MAC # Etherswitch devices device etherswitch # Enable etherswitch support device miiproxy # Required for etherswitch device e6000sw # Marvell mv88e6085 based switches # Block devices device ahci device scbus device da # ATA/SCSI peripherals device pass # Passthrough device (direct ATA/SCSI access) # NVM Express (NVMe) support device nvme # base NVMe driver options NVME_USE_NVD=0 # prefer the cam(4) based nda(4) driver device nvd # expose NVMe namespaces as disks, depends on nvme # MMC/SD/SDIO Card slot support device sdhci device sdhci_xenon # Marvell Xenon SD/MMC controller device aw_mmc # Allwinner SD/MMC controller device mmc # mmc/sd bus device mmcsd # mmc/sd flash cards device dwmmc device rk_emmcphy # Serial (COM) ports device uart # Generic UART driver device uart_msm # Qualcomm MSM UART driver device uart_mu # RPI3 aux port device uart_mvebu # Armada 3700 UART driver device uart_ns8250 # ns8250-type UART driver device uart_snps device pl011 # USB support device aw_ehci # Allwinner EHCI USB interface (USB 2.0) device aw_usbphy # Allwinner USB PHY device dwcotg # DWC OTG controller device ohci # OHCI USB interface device ehci # EHCI USB interface (USB 2.0) device ehci_mv # Marvell EHCI USB interface device xhci # XHCI USB interface (USB 3.0) device usb # USB Bus (required) device ukbd # Keyboard device umass # Disks/Mass storage - Requires scbus and da # USB ethernet support device muge device smcphy device smsc # Sound support device sound device a10_codec # DMA controller device a31_dmac # GPIO / PINCTRL device a37x0_gpio # Marvell Armada 37x0 GPIO controller device aw_gpio # Allwinner GPIO controller device gpio device gpioled device fdt_pinctrl device gpioregulator device mv_gpio # Marvell GPIO controller device mvebu_pinctrl # Marvell Pinmux Controller device rk_gpio # RockChip GPIO Controller device rk_pinctrl # RockChip Pinmux Controller # I2C device aw_rsb # Allwinner Reduced Serial Bus device bcm2835_bsc # Broadcom BCM283x I2C bus device iicbus device iic device twsi # Allwinner I2C controller device rk_i2c # RockChip I2C controller device syr827 # Silergy SYR827 PMIC device sy8106a # SY8106A Buck Regulator # Clock and reset controllers device aw_ccu # Allwinner clock controller # Interrupt controllers device aw_nmi # Allwinner NMI support device mv_cp110_icu # Marvell CP110 ICU device mv_ap806_gicp # Marvell AP806 GICP # Real-time clock support device aw_rtc # Allwinner Real-time Clock device mv_rtc # Marvell Real-time Clock # Watchdog controllers device aw_wdog # Allwinner Watchdog # Power management controllers device axp81x # X-Powers AXP81x PMIC device rk805 # RockChip RK805 PMIC # EFUSE device aw_sid # Allwinner Secure ID EFUSE # Thermal sensors device aw_thermal # Allwinner Thermal Sensor Controller device mv_thermal # Marvell Thermal Sensor Controller # SPI device spibus device bcm2835_spi # Broadcom BCM283x SPI bus # PWM device pwm device aw_pwm # Console device vt device kbdmux device vt_efifb # EVDEV support device evdev # input event device support options EVDEV_SUPPORT # evdev support in legacy drivers device uinput # install /dev/uinput cdev device aw_cir # Pseudo devices. device crypto # core crypto support device loop # Network loopback device ether # Ethernet support device vlan # 802.1Q VLAN support device tuntap # Packet tunnel. device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device firmware # firmware assist module options EFIRT # EFI Runtime Services # EXT_RESOURCES pseudo devices options EXT_RESOURCES device clk device phy device hwreset device nvmem device regulator device syscon device aw_syscon # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. device bpf # Berkeley packet filter # Chip-specific errata options THUNDERX_PASS_1_1_ERRATA options FDT device acpi # DTBs makeoptions MODULES_EXTRA="dtb/allwinner dtb/rockchip dtb/rpi" Index: projects/fuse2/sys/compat/freebsd32/freebsd32_capability.c =================================================================== --- projects/fuse2/sys/compat/freebsd32/freebsd32_capability.c (revision 350434) +++ projects/fuse2/sys/compat/freebsd32/freebsd32_capability.c (revision 350435) @@ -1,156 +1,157 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include +#include #include #include #include #include #include #include #ifdef CAPABILITIES MALLOC_DECLARE(M_FILECAPS); int freebsd32_cap_ioctls_limit(struct thread *td, struct freebsd32_cap_ioctls_limit_args *uap) { u_long *cmds; uint32_t *cmds32; size_t ncmds; u_int i; int error; ncmds = uap->ncmds; if (ncmds > 256) /* XXX: Is 256 sane? */ return (EINVAL); if (ncmds == 0) { cmds = NULL; } else { cmds32 = malloc(sizeof(cmds32[0]) * ncmds, M_FILECAPS, M_WAITOK); error = copyin(uap->cmds, cmds32, sizeof(cmds32[0]) * ncmds); if (error != 0) { free(cmds32, M_FILECAPS); return (error); } cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK); for (i = 0; i < ncmds; i++) cmds[i] = cmds32[i]; free(cmds32, M_FILECAPS); } return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds)); } int freebsd32_cap_ioctls_get(struct thread *td, struct freebsd32_cap_ioctls_get_args *uap) { struct filedesc *fdp; struct filedescent *fdep; uint32_t *cmds32; u_long *cmds; size_t maxcmds; int error, fd; u_int i; fd = uap->fd; cmds32 = uap->cmds; maxcmds = uap->maxcmds; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { error = EBADF; goto out; } /* * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL) * the only sane thing we can do is to not populate the given array and * return CAP_IOCTLS_ALL (actually, INT_MAX). */ fdep = &fdp->fd_ofiles[fd]; cmds = fdep->fde_ioctls; if (cmds32 != NULL && cmds != NULL) { for (i = 0; i < MIN(fdep->fde_nioctls, maxcmds); i++) { error = suword32(&cmds32[i], cmds[i]); if (error != 0) goto out; } } if (fdep->fde_nioctls == -1) td->td_retval[0] = INT_MAX; else td->td_retval[0] = fdep->fde_nioctls; error = 0; out: FILEDESC_SUNLOCK(fdp); return (error); } #else /* !CAPABILITIES */ int freebsd32_cap_ioctls_limit(struct thread *td, struct freebsd32_cap_ioctls_limit_args *uap) { return (ENOSYS); } int freebsd32_cap_ioctls_get(struct thread *td, struct freebsd32_cap_ioctls_get_args *uap) { return (ENOSYS); } #endif /* CAPABILITIES */ Index: projects/fuse2/sys/conf/files.arm64 =================================================================== --- projects/fuse2/sys/conf/files.arm64 (revision 350434) +++ projects/fuse2/sys/conf/files.arm64 (revision 350435) @@ -1,291 +1,292 @@ # $FreeBSD$ cloudabi32_vdso.o optional compat_cloudabi32 \ dependency "$S/contrib/cloudabi/cloudabi_vdso_armv6_on_64bit.S" \ compile-with "${CC} -x assembler-with-cpp -m32 -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_armv6_on_64bit.S -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "cloudabi32_vdso.o" # cloudabi32_vdso_blob.o optional compat_cloudabi32 \ dependency "cloudabi32_vdso.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf64-littleaarch64 --binary-architecture aarch64 cloudabi32_vdso.o ${.TARGET}" \ no-implicit-rule \ clean "cloudabi32_vdso_blob.o" # cloudabi64_vdso.o optional compat_cloudabi64 \ dependency "$S/contrib/cloudabi/cloudabi_vdso_aarch64.S" \ compile-with "${CC} -x assembler-with-cpp -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_aarch64.S -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "cloudabi64_vdso.o" # cloudabi64_vdso_blob.o optional compat_cloudabi64 \ dependency "cloudabi64_vdso.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf64-littleaarch64 --binary-architecture aarch64 cloudabi64_vdso.o ${.TARGET}" \ no-implicit-rule \ clean "cloudabi64_vdso_blob.o" # # Allwinner common files arm/allwinner/a10_ehci.c optional ehci aw_ehci fdt arm/allwinner/a10_timer.c optional a10_timer fdt arm/allwinner/a10_codec.c optional sound a10_codec arm/allwinner/a31_dmac.c optional a31_dmac arm/allwinner/sunxi_dma_if.m optional a31_dmac arm/allwinner/aw_cir.c optional evdev aw_cir fdt arm/allwinner/aw_gpio.c optional gpio aw_gpio fdt arm/allwinner/aw_mmc.c optional mmc aw_mmc fdt | mmccam aw_mmc fdt arm/allwinner/aw_nmi.c optional aw_nmi fdt \ compile-with "${NORMAL_C} -I$S/gnu/dts/include" arm/allwinner/aw_pwm.c optional aw_pwm fdt arm/allwinner/aw_rsb.c optional aw_rsb fdt arm/allwinner/aw_rtc.c optional aw_rtc fdt arm/allwinner/aw_sid.c optional aw_sid nvmem fdt arm/allwinner/aw_spi.c optional aw_spi fdt arm/allwinner/aw_syscon.c optional aw_syscon ext_resources syscon fdt arm/allwinner/aw_thermal.c optional aw_thermal nvmem fdt arm/allwinner/aw_usbphy.c optional ehci aw_usbphy fdt arm/allwinner/aw_wdog.c optional aw_wdog fdt arm/allwinner/axp81x.c optional axp81x fdt arm/allwinner/if_awg.c optional awg ext_resources syscon aw_sid nvmem fdt # Allwinner clock driver arm/allwinner/clkng/aw_ccung.c optional aw_ccu fdt arm/allwinner/clkng/aw_clk_frac.c optional aw_ccu fdt arm/allwinner/clkng/aw_clk_nkmp.c optional aw_ccu fdt arm/allwinner/clkng/aw_clk_nm.c optional aw_ccu fdt arm/allwinner/clkng/aw_clk_prediv_mux.c optional aw_ccu fdt arm/allwinner/clkng/ccu_a64.c optional soc_allwinner_a64 aw_ccu fdt arm/allwinner/clkng/ccu_h3.c optional soc_allwinner_h5 aw_ccu fdt arm/allwinner/clkng/ccu_sun8i_r.c optional aw_ccu fdt arm/allwinner/clkng/ccu_de2.c optional aw_ccu fdt # Allwinner padconf files arm/allwinner/a64/a64_padconf.c optional soc_allwinner_a64 fdt arm/allwinner/a64/a64_r_padconf.c optional soc_allwinner_a64 fdt arm/allwinner/h3/h3_padconf.c optional soc_allwinner_h5 fdt arm/allwinner/h3/h3_r_padconf.c optional soc_allwinner_h5 fdt arm/annapurna/alpine/alpine_ccu.c optional al_ccu fdt arm/annapurna/alpine/alpine_nb_service.c optional al_nb_service fdt arm/annapurna/alpine/alpine_pci.c optional al_pci fdt arm/annapurna/alpine/alpine_pci_msix.c optional al_pci fdt arm/annapurna/alpine/alpine_serdes.c optional al_serdes fdt \ no-depend \ compile-with "${CC} -c -o ${.TARGET} ${CFLAGS} -I$S/contrib/alpine-hal -I$S/contrib/alpine-hal/eth ${PROF} ${.IMPSRC}" arm/arm/generic_timer.c standard arm/arm/gic.c standard arm/arm/gic_acpi.c optional acpi arm/arm/gic_fdt.c optional fdt arm/arm/pmu.c standard arm/arm/physmem.c standard arm/broadcom/bcm2835/bcm2835_audio.c optional sound vchiq fdt \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" arm/broadcom/bcm2835/bcm2835_bsc.c optional bcm2835_bsc soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_cpufreq.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_dma.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_fbd.c optional vt soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_ft5406.c optional evdev bcm2835_ft5406 soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_gpio.c optional gpio soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_intr.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_mbox.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_rng.c optional !random_loadable soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_sdhci.c optional sdhci soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_sdhost.c optional sdhci soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_spi.c optional bcm2835_spi soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_vcio.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_wdog.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2836.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm283x_dwc_fdt.c optional dwcotg fdt soc_brcm_bcm2837 arm/mv/a37x0_gpio.c optional a37x0_gpio gpio fdt arm/mv/gpio.c optional mv_gpio fdt arm/mv/mvebu_pinctrl.c optional mvebu_pinctrl fdt arm/mv/mv_cp110_icu.c optional mv_cp110_icu fdt arm/mv/mv_ap806_gicp.c optional mv_ap806_gicp fdt arm/mv/mv_ap806_clock.c optional SOC_MARVELL_8K fdt arm/mv/mv_cp110_clock.c optional SOC_MARVELL_8K fdt arm/mv/mv_thermal.c optional SOC_MARVELL_8K mv_thermal fdt arm/mv/armada38x/armada38x_rtc.c optional mv_rtc fdt arm/xilinx/uart_dev_cdnc.c optional uart soc_xilinx_zynq arm64/acpica/acpi_iort.c optional acpi arm64/acpica/acpi_machdep.c optional acpi arm64/acpica/OsdEnvironment.c optional acpi arm64/acpica/acpi_wakeup.c optional acpi arm64/acpica/pci_cfgreg.c optional acpi pci arm64/arm64/autoconf.c standard arm64/arm64/bus_machdep.c standard arm64/arm64/bus_space_asm.S standard arm64/arm64/busdma_bounce.c standard arm64/arm64/busdma_machdep.c standard arm64/arm64/bzero.S standard arm64/arm64/clock.c standard arm64/arm64/copyinout.S standard arm64/arm64/copystr.c standard arm64/arm64/cpu_errata.c standard arm64/arm64/cpufunc_asm.S standard arm64/arm64/db_disasm.c optional ddb arm64/arm64/db_interface.c optional ddb arm64/arm64/db_trace.c optional ddb arm64/arm64/debug_monitor.c optional ddb arm64/arm64/disassem.c optional ddb arm64/arm64/dump_machdep.c standard arm64/arm64/efirt_machdep.c optional efirt arm64/arm64/elf32_machdep.c optional compat_freebsd32 arm64/arm64/elf_machdep.c standard arm64/arm64/exception.S standard arm64/arm64/freebsd32_machdep.c optional compat_freebsd32 arm64/arm64/gicv3_its.c optional intrng fdt arm64/arm64/gic_v3.c standard arm64/arm64/gic_v3_acpi.c optional acpi arm64/arm64/gic_v3_fdt.c optional fdt arm64/arm64/identcpu.c standard arm64/arm64/in_cksum.c optional inet | inet6 arm64/arm64/locore.S standard no-obj arm64/arm64/machdep.c standard arm64/arm64/mem.c standard arm64/arm64/memcpy.S standard arm64/arm64/memmove.S standard arm64/arm64/minidump_machdep.c standard arm64/arm64/mp_machdep.c optional smp arm64/arm64/nexus.c standard arm64/arm64/ofw_machdep.c optional fdt arm64/arm64/pmap.c standard arm64/arm64/stack_machdep.c optional ddb | stack arm64/arm64/support.S standard arm64/arm64/swtch.S standard arm64/arm64/sys_machdep.c standard arm64/arm64/trap.c standard arm64/arm64/uio_machdep.c standard arm64/arm64/uma_machdep.c standard arm64/arm64/undefined.c standard arm64/arm64/unwind.c optional ddb | kdtrace_hooks | stack arm64/arm64/vfp.c standard arm64/arm64/vm_machdep.c standard arm64/cavium/thunder_pcie_fdt.c optional soc_cavm_thunderx pci fdt arm64/cavium/thunder_pcie_pem.c optional soc_cavm_thunderx pci arm64/cavium/thunder_pcie_pem_fdt.c optional soc_cavm_thunderx pci fdt arm64/cavium/thunder_pcie_common.c optional soc_cavm_thunderx pci arm64/cloudabi32/cloudabi32_sysvec.c optional compat_cloudabi32 arm64/cloudabi64/cloudabi64_sysvec.c optional compat_cloudabi64 arm64/coresight/coresight.c standard arm64/coresight/coresight_if.m standard arm64/coresight/coresight-cmd.c standard arm64/coresight/coresight-cpu-debug.c standard arm64/coresight/coresight-dynamic-replicator.c standard arm64/coresight/coresight-etm4x.c standard arm64/coresight/coresight-funnel.c standard arm64/coresight/coresight-tmc.c standard arm64/qualcomm/qcom_gcc.c optional qcom_gcc fdt contrib/vchiq/interface/compat/vchi_bsd.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_arm.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_connected.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_core.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_shim.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_util.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" crypto/armv8/armv8_crypto.c optional armv8crypto armv8_crypto_wrap.o optional armv8crypto \ dependency "$S/crypto/armv8/armv8_crypto_wrap.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc:N-mgeneral-regs-only} -I$S/crypto/armv8/ ${WERROR} ${NO_WCAST_QUAL} ${PROF} -march=armv8-a+crypto ${.IMPSRC}" \ no-implicit-rule \ clean "armv8_crypto_wrap.o" crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support crypto/des/des_enc.c optional crypto | ipsec | ipsec_support | netsmb dev/acpica/acpi_bus_if.m optional acpi dev/acpica/acpi_if.m optional acpi dev/acpica/acpi_pci_link.c optional acpi pci dev/acpica/acpi_pcib.c optional acpi pci dev/acpica/acpi_pxm.c optional acpi dev/ahci/ahci_generic.c optional ahci +dev/altera/dwc/if_dwc_socfpga.c optional fdt dwc_socfpga dev/axgbe/if_axgbe.c optional axgbe dev/axgbe/xgbe-desc.c optional axgbe dev/axgbe/xgbe-dev.c optional axgbe dev/axgbe/xgbe-drv.c optional axgbe dev/axgbe/xgbe-mdio.c optional axgbe dev/cpufreq/cpufreq_dt.c optional cpufreq fdt dev/iicbus/sy8106a.c optional sy8106a fdt dev/iicbus/twsi/mv_twsi.c optional twsi fdt dev/iicbus/twsi/a10_twsi.c optional twsi fdt dev/iicbus/twsi/twsi.c optional twsi fdt dev/hwpmc/hwpmc_arm64.c optional hwpmc dev/hwpmc/hwpmc_arm64_md.c optional hwpmc dev/mbox/mbox_if.m optional soc_brcm_bcm2837 dev/mmc/host/dwmmc.c optional dwmmc fdt dev/mmc/host/dwmmc_hisi.c optional dwmmc fdt soc_hisi_hi6220 dev/mmc/host/dwmmc_rockchip.c optional dwmmc fdt soc_rockchip_rk3328 dev/neta/if_mvneta_fdt.c optional neta fdt dev/neta/if_mvneta.c optional neta mdio mii dev/ofw/ofw_cpu.c optional fdt dev/ofw/ofwpci.c optional fdt pci dev/pci/pci_host_generic.c optional pci dev/pci/pci_host_generic_acpi.c optional pci acpi dev/pci/pci_host_generic_fdt.c optional pci fdt dev/psci/psci.c standard dev/psci/psci_arm64.S standard dev/psci/smccc.c standard dev/sdhci/sdhci_xenon.c optional sdhci_xenon sdhci fdt dev/uart/uart_cpu_arm64.c optional uart dev/uart/uart_dev_mu.c optional uart uart_mu dev/uart/uart_dev_pl011.c optional uart pl011 dev/usb/controller/dwc_otg_hisi.c optional dwcotg fdt soc_hisi_hi6220 dev/usb/controller/ehci_mv.c optional ehci_mv fdt dev/usb/controller/generic_ehci.c optional ehci acpi dev/usb/controller/generic_ohci.c optional ohci fdt dev/usb/controller/generic_usb_if.m optional ohci fdt dev/usb/controller/usb_nop_xceiv.c optional fdt ext_resources dev/usb/controller/generic_xhci.c optional xhci dev/usb/controller/generic_xhci_acpi.c optional xhci acpi dev/usb/controller/generic_xhci_fdt.c optional xhci fdt dev/vnic/mrml_bridge.c optional vnic fdt dev/vnic/nic_main.c optional vnic pci dev/vnic/nicvf_main.c optional vnic pci pci_iov dev/vnic/nicvf_queues.c optional vnic pci pci_iov dev/vnic/thunder_bgx_fdt.c optional vnic fdt dev/vnic/thunder_bgx.c optional vnic pci dev/vnic/thunder_mdio_fdt.c optional vnic fdt dev/vnic/thunder_mdio.c optional vnic dev/vnic/lmac_if.m optional inet | inet6 | vnic kern/kern_clocksource.c standard kern/msi_if.m optional intrng kern/pic_if.m optional intrng kern/subr_devmap.c standard kern/subr_intr.c optional intrng libkern/bcmp.c standard libkern/memcmp.c standard libkern/memset.c standard libkern/arm64/crc32c_armv8.S standard cddl/contrib/opensolaris/common/atomic/aarch64/opensolaris_atomic.S optional zfs | dtrace compile-with "${CDDL_C}" cddl/dev/dtrace/aarch64/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/aarch64/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/aarch64/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" # RockChip Drivers arm64/rockchip/rk3399_emmcphy.c optional fdt rk_emmcphy soc_rockchip_rk3399 arm64/rockchip/rk_i2c.c optional fdt rk_i2c soc_rockchip_rk3328 | fdt rk_i2c soc_rockchip_rk3399 arm64/rockchip/rk805.c optional fdt rk805 soc_rockchip_rk3328 | fdt rk805 soc_rockchip_rk3399 arm64/rockchip/rk_grf.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 arm64/rockchip/rk_pinctrl.c optional fdt rk_pinctrl soc_rockchip_rk3328 | fdt rk_pinctrl soc_rockchip_rk3399 arm64/rockchip/rk_gpio.c optional fdt rk_gpio soc_rockchip_rk3328 | fdt rk_gpio soc_rockchip_rk3399 arm64/rockchip/if_dwc_rk.c optional fdt dwc_rk soc_rockchip_rk3328 | fdt dwc_rk soc_rockchip_rk3399 dev/dwc/if_dwc.c optional fdt dwc_rk soc_rockchip_rk3328 | fdt dwc_rk soc_rockchip_rk3399 dev/dwc/if_dwc_if.m optional fdt dwc_rk soc_rockchip_rk3328 | fdt dwc_rk soc_rockchip_rk3399 # RockChip Clock support arm64/rockchip/clk/rk_cru.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 arm64/rockchip/clk/rk_clk_armclk.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 arm64/rockchip/clk/rk_clk_composite.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 arm64/rockchip/clk/rk_clk_gate.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 arm64/rockchip/clk/rk_clk_mux.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 arm64/rockchip/clk/rk_clk_pll.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 arm64/rockchip/clk/rk3328_cru.c optional fdt soc_rockchip_rk3328 arm64/rockchip/clk/rk3399_cru.c optional fdt soc_rockchip_rk3399 arm64/rockchip/clk/rk3399_pmucru.c optional fdt soc_rockchip_rk3399 Index: projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c =================================================================== --- projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c (nonexistent) +++ projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c (revision 350435) @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Ruslan Bukin + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory (Department of Computer Science and + * Technology) under DARPA contract HR0011-18-C-0016 ("ECATS"), as part of the + * DARPA SSITH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "if_dwc_if.h" + +static int +if_dwc_socfpga_probe(device_t dev) +{ + + if (!ofw_bus_status_okay(dev)) + return (ENXIO); + + if (!ofw_bus_is_compatible(dev, "altr,socfpga-stmmac")) + return (ENXIO); + + device_set_desc(dev, "Altera SOCFPGA Ethernet MAC"); + + return (BUS_PROBE_DEFAULT); +} + +static int +if_dwc_socfpga_init(device_t dev) +{ + + return (0); +} + +static int +if_dwc_socfpga_mac_type(device_t dev) +{ + + return (DWC_GMAC); +} + +static int +if_dwc_socfpga_mii_clk(device_t dev) +{ + phandle_t root; + + root = OF_finddevice("/"); + + if (ofw_bus_node_is_compatible(root, "altr,socfpga-stratix10")) + return (GMAC_MII_CLK_35_60M_DIV26); + + /* Default value. */ + return (GMAC_MII_CLK_25_35M_DIV16); +} + +static device_method_t dwc_socfpga_methods[] = { + DEVMETHOD(device_probe, if_dwc_socfpga_probe), + + DEVMETHOD(if_dwc_init, if_dwc_socfpga_init), + DEVMETHOD(if_dwc_mac_type, if_dwc_socfpga_mac_type), + DEVMETHOD(if_dwc_mii_clk, if_dwc_socfpga_mii_clk), + + DEVMETHOD_END +}; + +static devclass_t dwc_socfpga_devclass; + +extern driver_t dwc_driver; + +DEFINE_CLASS_1(dwc, dwc_socfpga_driver, dwc_socfpga_methods, + sizeof(struct dwc_softc), dwc_driver); +EARLY_DRIVER_MODULE(dwc_socfpga, simplebus, dwc_socfpga_driver, + dwc_socfpga_devclass, 0, 0, BUS_PASS_SUPPORTDEV + BUS_PASS_ORDER_MIDDLE); + +MODULE_DEPEND(dwc_socfpga, dwc, 1, 1, 1); Property changes on: projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c =================================================================== --- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c (revision 350434) +++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c (revision 350435) @@ -1,885 +1,886 @@ /*- * Copyright (c) 2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include +#include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_datavar.h" #include "bhnd_nvram_data_tlvreg.h" /* * CFE TLV NVRAM data class. * * The CFE-defined TLV NVRAM format is used on the WGT634U. */ struct bhnd_nvram_tlv { struct bhnd_nvram_data nv; /**< common instance state */ struct bhnd_nvram_io *data; /**< backing buffer */ size_t count; /**< variable count */ }; BHND_NVRAM_DATA_CLASS_DEFN(tlv, "WGT634U", BHND_NVRAM_DATA_CAP_DEVPATHS, sizeof(struct bhnd_nvram_tlv)) /** Minimal TLV_ENV record header */ struct bhnd_nvram_tlv_env_hdr { uint8_t tag; uint8_t size; } __packed; /** Minimal TLV_ENV record */ struct bhnd_nvram_tlv_env { struct bhnd_nvram_tlv_env_hdr hdr; uint8_t flags; char envp[]; } __packed; /* Return the length in bytes of an TLV_ENV's envp data */ #define NVRAM_TLV_ENVP_DATA_LEN(_env) \ (((_env)->hdr.size < sizeof((_env)->flags)) ? 0 : \ ((_env)->hdr.size - sizeof((_env)->flags))) /* Maximum supported length of the envp data field, in bytes */ #define NVRAM_TLV_ENVP_DATA_MAX_LEN \ (UINT8_MAX - sizeof(uint8_t) /* flags */) static int bhnd_nvram_tlv_parse_size( struct bhnd_nvram_io *io, size_t *size); static int bhnd_nvram_tlv_next_record( struct bhnd_nvram_io *io, size_t *next, size_t *offset, uint8_t *tag); static struct bhnd_nvram_tlv_env *bhnd_nvram_tlv_next_env( struct bhnd_nvram_tlv *tlv, size_t *next, void **cookiep); static struct bhnd_nvram_tlv_env *bhnd_nvram_tlv_get_env( struct bhnd_nvram_tlv *tlv, void *cookiep); static void *bhnd_nvram_tlv_to_cookie( struct bhnd_nvram_tlv *tlv, size_t io_offset); static size_t bhnd_nvram_tlv_to_offset( struct bhnd_nvram_tlv *tlv, void *cookiep); static int bhnd_nvram_tlv_probe(struct bhnd_nvram_io *io) { struct bhnd_nvram_tlv_env ident; size_t nbytes; int error; nbytes = bhnd_nvram_io_getsize(io); /* Handle what might be an empty TLV image */ if (nbytes < sizeof(ident)) { uint8_t tag; /* Fetch just the first tag */ error = bhnd_nvram_io_read(io, 0x0, &tag, sizeof(tag)); if (error) return (error); /* This *could* be an empty TLV image, but all we're * testing for here is a single 0x0 byte followed by EOF */ if (tag == NVRAM_TLV_TYPE_END) return (BHND_NVRAM_DATA_PROBE_MAYBE); return (ENXIO); } /* Otherwise, look at the initial header for a valid TLV ENV tag, * plus one byte of the entry data */ error = bhnd_nvram_io_read(io, 0x0, &ident, sizeof(ident) + sizeof(ident.envp[0])); if (error) return (error); /* First entry should be a variable record (which we statically * assert as being defined to use a single byte size field) */ if (ident.hdr.tag != NVRAM_TLV_TYPE_ENV) return (ENXIO); _Static_assert(NVRAM_TLV_TYPE_ENV & NVRAM_TLV_TF_U8_LEN, "TYPE_ENV is not a U8-sized field"); /* The entry must be at least 3 characters ('x=\0') in length */ if (ident.hdr.size < 3) return (ENXIO); /* The first character should be a valid key char (alpha) */ if (!bhnd_nv_isalpha(ident.envp[0])) return (ENXIO); return (BHND_NVRAM_DATA_PROBE_DEFAULT); } static int bhnd_nvram_tlv_getvar_direct(struct bhnd_nvram_io *io, const char *name, void *buf, size_t *len, bhnd_nvram_type type) { struct bhnd_nvram_tlv_env env; char data[NVRAM_TLV_ENVP_DATA_MAX_LEN]; size_t data_len; const char *key, *value; size_t keylen, vlen; size_t namelen; size_t next, off; uint8_t tag; int error; namelen = strlen(name); /* Iterate over the input looking for the requested variable */ next = 0; while (!(error = bhnd_nvram_tlv_next_record(io, &next, &off, &tag))) { switch (tag) { case NVRAM_TLV_TYPE_END: /* Not found */ return (ENOENT); case NVRAM_TLV_TYPE_ENV: /* Read the record header */ error = bhnd_nvram_io_read(io, off, &env, sizeof(env)); if (error) { BHND_NV_LOG("error reading TLV_ENV record " "header: %d\n", error); return (error); } /* Read the record data */ data_len = NVRAM_TLV_ENVP_DATA_LEN(&env); error = bhnd_nvram_io_read(io, off + sizeof(env), data, data_len); if (error) { BHND_NV_LOG("error reading TLV_ENV record " "data: %d\n", error); return (error); } /* Parse the key=value string */ error = bhnd_nvram_parse_env(data, data_len, '=', &key, &keylen, &value, &vlen); if (error) { BHND_NV_LOG("error parsing TLV_ENV data: %d\n", error); return (error); } /* Match against requested variable name */ if (keylen == namelen && strncmp(key, name, namelen) == 0) { return (bhnd_nvram_value_coerce(value, vlen, BHND_NVRAM_TYPE_STRING, buf, len, type)); } break; default: /* Skip unknown tags */ break; } } /* Hit I/O error */ return (error); } static int bhnd_nvram_tlv_serialize(bhnd_nvram_data_class *cls, bhnd_nvram_plist *props, bhnd_nvram_plist *options, void *outp, size_t *olen) { bhnd_nvram_prop *prop; size_t limit, nbytes; int error; /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; nbytes = 0; /* Write all properties */ prop = NULL; while ((prop = bhnd_nvram_plist_next(props, prop)) != NULL) { struct bhnd_nvram_tlv_env env; const char *name; uint8_t *p; size_t name_len, value_len; size_t rec_size; env.hdr.tag = NVRAM_TLV_TYPE_ENV; env.hdr.size = sizeof(env.flags); env.flags = 0x0; /* Fetch name value and add to record length */ name = bhnd_nvram_prop_name(prop); name_len = strlen(name) + 1 /* '=' */; if (UINT8_MAX - env.hdr.size < name_len) { BHND_NV_LOG("%s name exceeds maximum TLV record " "length\n", name); return (EFTYPE); /* would overflow TLV size */ } env.hdr.size += name_len; /* Add string value to record length */ error = bhnd_nvram_prop_encode(prop, NULL, &value_len, BHND_NVRAM_TYPE_STRING); if (error) { BHND_NV_LOG("error serializing %s to required type " "%s: %d\n", name, bhnd_nvram_type_name(BHND_NVRAM_TYPE_STRING), error); return (error); } if (UINT8_MAX - env.hdr.size < value_len) { BHND_NV_LOG("%s value exceeds maximum TLV record " "length\n", name); return (EFTYPE); /* would overflow TLV size */ } env.hdr.size += value_len; /* Calculate total record size */ rec_size = sizeof(env.hdr) + env.hdr.size; if (SIZE_MAX - nbytes < rec_size) return (EFTYPE); /* would overflow size_t */ /* Calculate our output pointer */ if (nbytes > limit || limit - nbytes < rec_size) { /* buffer is full; cannot write */ p = NULL; } else { p = (uint8_t *)outp + nbytes; } /* Write to output */ if (p != NULL) { memcpy(p, &env, sizeof(env)); p += sizeof(env); memcpy(p, name, name_len - 1); p[name_len - 1] = '='; p += name_len; error = bhnd_nvram_prop_encode(prop, p, &value_len, BHND_NVRAM_TYPE_STRING); if (error) { BHND_NV_LOG("error serializing %s to required " "type %s: %d\n", name, bhnd_nvram_type_name( BHND_NVRAM_TYPE_STRING), error); return (error); } } nbytes += rec_size; } /* Write terminating END record */ if (limit > nbytes) *((uint8_t *)outp + nbytes) = NVRAM_TLV_TYPE_END; if (nbytes == SIZE_MAX) return (EFTYPE); /* would overflow size_t */ nbytes++; /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Initialize @p tlv with the provided NVRAM TLV data mapped by @p src. * * @param tlv A newly allocated data instance. */ static int bhnd_nvram_tlv_init(struct bhnd_nvram_tlv *tlv, struct bhnd_nvram_io *src) { struct bhnd_nvram_tlv_env *env; size_t size; size_t next; int error; BHND_NV_ASSERT(tlv->data == NULL, ("tlv data already initialized")); /* Determine the actual size of the TLV source data */ if ((error = bhnd_nvram_tlv_parse_size(src, &size))) return (error); /* Copy to our own internal buffer */ if ((tlv->data = bhnd_nvram_iobuf_copy_range(src, 0x0, size)) == NULL) return (ENOMEM); /* Initialize our backing buffer */ tlv->count = 0; next = 0; while ((env = bhnd_nvram_tlv_next_env(tlv, &next, NULL)) != NULL) { size_t env_len; size_t name_len; /* TLV_ENV data must not be empty */ env_len = NVRAM_TLV_ENVP_DATA_LEN(env); if (env_len == 0) { BHND_NV_LOG("cannot parse zero-length TLV_ENV record " "data\n"); return (EINVAL); } /* Parse the key=value string, and then replace the '=' * delimiter with '\0' to allow us to provide direct * name pointers from our backing buffer */ error = bhnd_nvram_parse_env(env->envp, env_len, '=', NULL, &name_len, NULL, NULL); if (error) { BHND_NV_LOG("error parsing TLV_ENV data: %d\n", error); return (error); } /* Replace '=' with '\0' */ *(env->envp + name_len) = '\0'; /* Add to variable count */ tlv->count++; }; return (0); } static int bhnd_nvram_tlv_new(struct bhnd_nvram_data *nv, struct bhnd_nvram_io *io) { struct bhnd_nvram_tlv *tlv; int error; /* Allocate and initialize the TLV data instance */ tlv = (struct bhnd_nvram_tlv *)nv; /* Parse the TLV input data and initialize our backing * data representation */ if ((error = bhnd_nvram_tlv_init(tlv, io))) { bhnd_nvram_tlv_free(nv); return (error); } return (0); } static void bhnd_nvram_tlv_free(struct bhnd_nvram_data *nv) { struct bhnd_nvram_tlv *tlv = (struct bhnd_nvram_tlv *)nv; if (tlv->data != NULL) bhnd_nvram_io_free(tlv->data); } size_t bhnd_nvram_tlv_count(struct bhnd_nvram_data *nv) { struct bhnd_nvram_tlv *tlv = (struct bhnd_nvram_tlv *)nv; return (tlv->count); } static bhnd_nvram_plist * bhnd_nvram_tlv_options(struct bhnd_nvram_data *nv) { return (NULL); } static uint32_t bhnd_nvram_tlv_caps(struct bhnd_nvram_data *nv) { return (BHND_NVRAM_DATA_CAP_READ_PTR|BHND_NVRAM_DATA_CAP_DEVPATHS); } static const char * bhnd_nvram_tlv_next(struct bhnd_nvram_data *nv, void **cookiep) { struct bhnd_nvram_tlv *tlv; struct bhnd_nvram_tlv_env *env; size_t io_offset; tlv = (struct bhnd_nvram_tlv *)nv; /* Find next readable TLV record */ if (*cookiep == NULL) { /* Start search at offset 0x0 */ io_offset = 0x0; env = bhnd_nvram_tlv_next_env(tlv, &io_offset, cookiep); } else { /* Seek past the previous env record */ io_offset = bhnd_nvram_tlv_to_offset(tlv, *cookiep); env = bhnd_nvram_tlv_next_env(tlv, &io_offset, NULL); if (env == NULL) BHND_NV_PANIC("invalid cookiep; record missing"); /* Advance to next env record, update the caller's cookiep */ env = bhnd_nvram_tlv_next_env(tlv, &io_offset, cookiep); } /* Check for EOF */ if (env == NULL) return (NULL); /* Return the NUL terminated name */ return (env->envp); } static void * bhnd_nvram_tlv_find(struct bhnd_nvram_data *nv, const char *name) { return (bhnd_nvram_data_generic_find(nv, name)); } static int bhnd_nvram_tlv_getvar_order(struct bhnd_nvram_data *nv, void *cookiep1, void *cookiep2) { if (cookiep1 < cookiep2) return (-1); if (cookiep1 > cookiep2) return (1); return (0); } static int bhnd_nvram_tlv_getvar(struct bhnd_nvram_data *nv, void *cookiep, void *buf, size_t *len, bhnd_nvram_type type) { return (bhnd_nvram_data_generic_rp_getvar(nv, cookiep, buf, len, type)); } static int bhnd_nvram_tlv_copy_val(struct bhnd_nvram_data *nv, void *cookiep, bhnd_nvram_val **value) { return (bhnd_nvram_data_generic_rp_copy_val(nv, cookiep, value)); } static const void * bhnd_nvram_tlv_getvar_ptr(struct bhnd_nvram_data *nv, void *cookiep, size_t *len, bhnd_nvram_type *type) { struct bhnd_nvram_tlv *tlv; struct bhnd_nvram_tlv_env *env; const char *val; int error; tlv = (struct bhnd_nvram_tlv *)nv; /* Fetch pointer to the TLV_ENV record */ if ((env = bhnd_nvram_tlv_get_env(tlv, cookiep)) == NULL) BHND_NV_PANIC("invalid cookiep: %p", cookiep); /* Parse value pointer and length from key\0value data */ error = bhnd_nvram_parse_env(env->envp, NVRAM_TLV_ENVP_DATA_LEN(env), '\0', NULL, NULL, &val, len); if (error) BHND_NV_PANIC("unexpected error parsing '%s'", env->envp); /* Type is always CSTR */ *type = BHND_NVRAM_TYPE_STRING; return (val); } static const char * bhnd_nvram_tlv_getvar_name(struct bhnd_nvram_data *nv, void *cookiep) { struct bhnd_nvram_tlv *tlv; const struct bhnd_nvram_tlv_env *env; tlv = (struct bhnd_nvram_tlv *)nv; /* Fetch pointer to the TLV_ENV record */ if ((env = bhnd_nvram_tlv_get_env(tlv, cookiep)) == NULL) BHND_NV_PANIC("invalid cookiep: %p", cookiep); /* Return name pointer */ return (&env->envp[0]); } static int bhnd_nvram_tlv_filter_setvar(struct bhnd_nvram_data *nv, const char *name, bhnd_nvram_val *value, bhnd_nvram_val **result) { bhnd_nvram_val *str; const char *inp; bhnd_nvram_type itype; size_t ilen; size_t name_len, tlv_nremain; int error; tlv_nremain = NVRAM_TLV_ENVP_DATA_MAX_LEN; /* Name (trimmed of any path prefix) must be valid */ if (!bhnd_nvram_validate_name(bhnd_nvram_trim_path_name(name))) return (EINVAL); /* 'name=' must fit within the maximum TLV_ENV record length */ name_len = strlen(name) + 1; /* '=' */ if (tlv_nremain < name_len) { BHND_NV_LOG("'%s=' exceeds maximum TLV_ENV record length\n", name); return (EINVAL); } tlv_nremain -= name_len; /* Convert value to a (bcm-formatted) string */ error = bhnd_nvram_val_convert_new(&str, &bhnd_nvram_val_bcm_string_fmt, value, BHND_NVRAM_VAL_DYNAMIC); if (error) return (error); /* The string value must fit within remaining TLV_ENV record length */ inp = bhnd_nvram_val_bytes(str, &ilen, &itype); if (tlv_nremain < ilen) { BHND_NV_LOG("'%.*s\\0' exceeds maximum TLV_ENV record length\n", BHND_NV_PRINT_WIDTH(ilen), inp); bhnd_nvram_val_release(str); return (EINVAL); } tlv_nremain -= name_len; /* Success. Transfer result ownership to the caller. */ *result = str; return (0); } static int bhnd_nvram_tlv_filter_unsetvar(struct bhnd_nvram_data *nv, const char *name) { /* We permit deletion of any variable */ return (0); } /** * Iterate over the records starting at @p next, returning the parsed * record's @p tag, @p size, and @p offset. * * @param io The I/O context to parse. * @param[in,out] next The next offset to be parsed, or 0x0 * to begin parsing. Upon successful * return, will be set to the offset of the * next record (or EOF, if * NVRAM_TLV_TYPE_END was parsed). * @param[out] offset The record's value offset. * @param[out] tag The record's tag. * * @retval 0 success * @retval EINVAL if parsing @p io as TLV fails. * @retval non-zero if reading @p io otherwise fails, a regular unix error * code will be returned. */ static int bhnd_nvram_tlv_next_record(struct bhnd_nvram_io *io, size_t *next, size_t *offset, uint8_t *tag) { size_t io_offset, io_size; uint16_t parsed_len; uint8_t len_hdr[2]; int error; io_offset = *next; io_size = bhnd_nvram_io_getsize(io); /* Save the record offset */ if (offset != NULL) *offset = io_offset; /* Fetch initial tag */ error = bhnd_nvram_io_read(io, io_offset, tag, sizeof(*tag)); if (error) return (error); io_offset++; /* EOF */ if (*tag == NVRAM_TLV_TYPE_END) { *next = io_offset; return (0); } /* Read length field */ if (*tag & NVRAM_TLV_TF_U8_LEN) { error = bhnd_nvram_io_read(io, io_offset, &len_hdr, sizeof(len_hdr[0])); if (error) { BHND_NV_LOG("error reading TLV record size: %d\n", error); return (error); } parsed_len = len_hdr[0]; io_offset++; } else { error = bhnd_nvram_io_read(io, io_offset, &len_hdr, sizeof(len_hdr)); if (error) { BHND_NV_LOG("error reading 16-bit TLV record " "size: %d\n", error); return (error); } parsed_len = (len_hdr[0] << 8) | len_hdr[1]; io_offset += 2; } /* Advance to next record */ if (parsed_len > io_size || io_size - parsed_len < io_offset) { /* Hit early EOF */ BHND_NV_LOG("TLV record length %hu truncated by input " "size of %zu\n", parsed_len, io_size); return (EINVAL); } *next = io_offset + parsed_len; /* Valid record found */ return (0); } /** * Parse the TLV data in @p io to determine the total size of the TLV * data mapped by @p io (which may be less than the size of @p io). */ static int bhnd_nvram_tlv_parse_size(struct bhnd_nvram_io *io, size_t *size) { size_t next; uint8_t tag; int error; /* We have to perform a minimal parse to determine the actual length */ next = 0x0; *size = 0x0; /* Iterate over the input until we hit END tag or the read fails */ do { error = bhnd_nvram_tlv_next_record(io, &next, NULL, &tag); if (error) return (error); } while (tag != NVRAM_TLV_TYPE_END); /* Offset should now point to EOF */ BHND_NV_ASSERT(next <= bhnd_nvram_io_getsize(io), ("parse returned invalid EOF offset")); *size = next; return (0); } /** * Iterate over the records in @p tlv, returning a pointer to the next * NVRAM_TLV_TYPE_ENV record, or NULL if EOF is reached. * * @param tlv The TLV instance. * @param[in,out] next The next offset to be parsed, or 0x0 * to begin parsing. Upon successful * return, will be set to the offset of the * next record. */ static struct bhnd_nvram_tlv_env * bhnd_nvram_tlv_next_env(struct bhnd_nvram_tlv *tlv, size_t *next, void **cookiep) { uint8_t tag; int error; /* Find the next TLV_ENV record, starting at @p next */ do { void *c; size_t offset; /* Fetch the next TLV record */ error = bhnd_nvram_tlv_next_record(tlv->data, next, &offset, &tag); if (error) { BHND_NV_LOG("unexpected error in next_record(): %d\n", error); return (NULL); } /* Only interested in ENV records */ if (tag != NVRAM_TLV_TYPE_ENV) continue; /* Map and return TLV_ENV record pointer */ c = bhnd_nvram_tlv_to_cookie(tlv, offset); /* Provide the cookiep value for the returned record */ if (cookiep != NULL) *cookiep = c; return (bhnd_nvram_tlv_get_env(tlv, c)); } while (tag != NVRAM_TLV_TYPE_END); /* No remaining ENV records */ return (NULL); } /** * Return a pointer to the TLV_ENV record for @p cookiep, or NULL * if none vailable. */ static struct bhnd_nvram_tlv_env * bhnd_nvram_tlv_get_env(struct bhnd_nvram_tlv *tlv, void *cookiep) { struct bhnd_nvram_tlv_env *env; void *ptr; size_t navail; size_t io_offset, io_size; int error; io_size = bhnd_nvram_io_getsize(tlv->data); io_offset = bhnd_nvram_tlv_to_offset(tlv, cookiep); /* At EOF? */ if (io_offset == io_size) return (NULL); /* Fetch non-const pointer to the record entry */ error = bhnd_nvram_io_write_ptr(tlv->data, io_offset, &ptr, sizeof(env->hdr), &navail); if (error) { /* Should never occur with a valid cookiep */ BHND_NV_LOG("error mapping record for cookiep: %d\n", error); return (NULL); } /* Validate the record pointer */ env = ptr; if (env->hdr.tag != NVRAM_TLV_TYPE_ENV) { /* Should never occur with a valid cookiep */ BHND_NV_LOG("non-ENV record mapped for %p\n", cookiep); return (NULL); } /* Is the required variable name data is mapped? */ if (navail < sizeof(struct bhnd_nvram_tlv_env_hdr) + env->hdr.size || env->hdr.size == sizeof(env->flags)) { /* Should never occur with a valid cookiep */ BHND_NV_LOG("TLV_ENV variable data not mapped for %p\n", cookiep); return (NULL); } return (env); } /** * Return a cookiep for the given I/O offset. */ static void * bhnd_nvram_tlv_to_cookie(struct bhnd_nvram_tlv *tlv, size_t io_offset) { const void *ptr; int error; BHND_NV_ASSERT(io_offset < bhnd_nvram_io_getsize(tlv->data), ("io_offset %zu out-of-range", io_offset)); BHND_NV_ASSERT(io_offset < UINTPTR_MAX, ("io_offset %#zx exceeds UINTPTR_MAX", io_offset)); error = bhnd_nvram_io_read_ptr(tlv->data, 0x0, &ptr, io_offset, NULL); if (error) BHND_NV_PANIC("error mapping offset %zu: %d", io_offset, error); ptr = (const uint8_t *)ptr + io_offset; return (__DECONST(void *, ptr)); } /* Convert a cookiep back to an I/O offset */ static size_t bhnd_nvram_tlv_to_offset(struct bhnd_nvram_tlv *tlv, void *cookiep) { const void *ptr; intptr_t offset; size_t io_size; int error; BHND_NV_ASSERT(cookiep != NULL, ("null cookiep")); io_size = bhnd_nvram_io_getsize(tlv->data); error = bhnd_nvram_io_read_ptr(tlv->data, 0x0, &ptr, io_size, NULL); if (error) BHND_NV_PANIC("error mapping offset %zu: %d", io_size, error); offset = (const uint8_t *)cookiep - (const uint8_t *)ptr; BHND_NV_ASSERT(offset >= 0, ("invalid cookiep")); BHND_NV_ASSERT((uintptr_t)offset < SIZE_MAX, ("cookiep > SIZE_MAX)")); BHND_NV_ASSERT((uintptr_t)offset <= io_size, ("cookiep > io_size)")); return ((size_t)offset); } Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_store.c =================================================================== --- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_store.c (revision 350434) +++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_store.c (revision 350435) @@ -1,1268 +1,1269 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include #include +#include #include #ifdef _KERNEL #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_datavar.h" #include "bhnd_nvram_storevar.h" /* * BHND NVRAM Store * * Manages in-memory and persistent representations of NVRAM data. */ static int bhnd_nvstore_parse_data( struct bhnd_nvram_store *sc); static int bhnd_nvstore_parse_path_entries( struct bhnd_nvram_store *sc); static int bhnd_nvram_store_export_child( struct bhnd_nvram_store *sc, bhnd_nvstore_path *top, bhnd_nvstore_path *child, bhnd_nvram_plist *plist, uint32_t flags); static int bhnd_nvstore_export_merge( struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, bhnd_nvram_plist *merged, uint32_t flags); static int bhnd_nvstore_export_devpath_alias( struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, const char *devpath, bhnd_nvram_plist *plist, u_long *alias_val); /** * Allocate and initialize a new NVRAM data store instance. * * The caller is responsible for deallocating the instance via * bhnd_nvram_store_free(). * * @param[out] store On success, a pointer to the newly allocated NVRAM data * instance. * @param data The NVRAM data to be managed by the returned NVRAM data store * instance. * * @retval 0 success * @retval non-zero if an error occurs during allocation or initialization, a * regular unix error code will be returned. */ int bhnd_nvram_store_new(struct bhnd_nvram_store **store, struct bhnd_nvram_data *data) { struct bhnd_nvram_store *sc; int error; /* Allocate new instance */ sc = bhnd_nv_calloc(1, sizeof(*sc)); if (sc == NULL) return (ENOMEM); BHND_NVSTORE_LOCK_INIT(sc); BHND_NVSTORE_LOCK(sc); /* Initialize path hash table */ sc->num_paths = 0; for (size_t i = 0; i < nitems(sc->paths); i++) LIST_INIT(&sc->paths[i]); /* Initialize alias hash table */ sc->num_aliases = 0; for (size_t i = 0; i < nitems(sc->aliases); i++) LIST_INIT(&sc->aliases[i]); /* Retain the NVRAM data */ sc->data = bhnd_nvram_data_retain(data); sc->data_caps = bhnd_nvram_data_caps(data); sc->data_opts = bhnd_nvram_data_options(data); if (sc->data_opts != NULL) { bhnd_nvram_plist_retain(sc->data_opts); } else { sc->data_opts = bhnd_nvram_plist_new(); if (sc->data_opts == NULL) { error = ENOMEM; goto cleanup; } } /* Register required root path */ error = bhnd_nvstore_register_path(sc, BHND_NVSTORE_ROOT_PATH, BHND_NVSTORE_ROOT_PATH_LEN); if (error) goto cleanup; sc->root_path = bhnd_nvstore_get_path(sc, BHND_NVSTORE_ROOT_PATH, BHND_NVSTORE_ROOT_PATH_LEN); BHND_NV_ASSERT(sc->root_path, ("missing root path")); /* Parse all variables vended by our backing NVRAM data instance, * generating all path entries, alias entries, and variable indexes */ if ((error = bhnd_nvstore_parse_data(sc))) goto cleanup; *store = sc; BHND_NVSTORE_UNLOCK(sc); return (0); cleanup: BHND_NVSTORE_UNLOCK(sc); bhnd_nvram_store_free(sc); return (error); } /** * Allocate and initialize a new NVRAM data store instance, parsing the * NVRAM data from @p io. * * The caller is responsible for deallocating the instance via * bhnd_nvram_store_free(). * * The NVRAM data mapped by @p io will be copied, and @p io may be safely * deallocated after bhnd_nvram_store_new() returns. * * @param[out] store On success, a pointer to the newly allocated NVRAM data * instance. * @param io An I/O context mapping the NVRAM data to be copied and parsed. * @param cls The NVRAM data class to be used when parsing @p io, or NULL * to perform runtime identification of the appropriate data class. * * @retval 0 success * @retval non-zero if an error occurs during allocation or initialization, a * regular unix error code will be returned. */ int bhnd_nvram_store_parse_new(struct bhnd_nvram_store **store, struct bhnd_nvram_io *io, bhnd_nvram_data_class *cls) { struct bhnd_nvram_data *data; int error; /* Try to parse the data */ if ((error = bhnd_nvram_data_new(cls, &data, io))) return (error); /* Try to create our new store instance */ error = bhnd_nvram_store_new(store, data); bhnd_nvram_data_release(data); return (error); } /** * Free an NVRAM store instance, releasing all associated resources. * * @param sc A store instance previously allocated via * bhnd_nvram_store_new(). */ void bhnd_nvram_store_free(struct bhnd_nvram_store *sc) { /* Clean up alias hash table */ for (size_t i = 0; i < nitems(sc->aliases); i++) { bhnd_nvstore_alias *alias, *anext; LIST_FOREACH_SAFE(alias, &sc->aliases[i], na_link, anext) bhnd_nv_free(alias); } /* Clean up path hash table */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *path, *pnext; LIST_FOREACH_SAFE(path, &sc->paths[i], np_link, pnext) bhnd_nvstore_path_free(path); } if (sc->data != NULL) bhnd_nvram_data_release(sc->data); if (sc->data_opts != NULL) bhnd_nvram_plist_release(sc->data_opts); BHND_NVSTORE_LOCK_DESTROY(sc); bhnd_nv_free(sc); } /** * Parse all variables vended by our backing NVRAM data instance, * generating all path entries, alias entries, and variable indexes. * * @param sc The NVRAM store instance to be initialized with * paths, aliases, and data parsed from its backing * data. * * @retval 0 success * @retval non-zero if an error occurs during parsing, a regular unix error * code will be returned. */ static int bhnd_nvstore_parse_data(struct bhnd_nvram_store *sc) { const char *name; void *cookiep; int error; /* Parse and register all device paths and path aliases. This enables * resolution of _forward_ references to device paths aliases when * scanning variable entries below */ if ((error = bhnd_nvstore_parse_path_entries(sc))) return (error); /* Calculate the per-path variable counts, and report dangling alias * references as an error. */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { bhnd_nvstore_path *path; bhnd_nvstore_name_info info; /* Parse the name info */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info); if (error) return (error); switch (info.type) { case BHND_NVSTORE_VAR: /* Fetch referenced path */ path = bhnd_nvstore_var_get_path(sc, &info); if (path == NULL) { BHND_NV_LOG("variable '%s' has dangling " "path reference\n", name); return (EFTYPE); } /* Increment path variable count */ if (path->num_vars == SIZE_MAX) { BHND_NV_LOG("more than SIZE_MAX variables in " "path %s\n", path->path_str); return (EFTYPE); } path->num_vars++; break; case BHND_NVSTORE_ALIAS_DECL: /* Skip -- path alias already parsed and recorded */ break; } } /* If the backing NVRAM data instance vends only a single root ("/") * path, we may be able to skip generating an index for the root * path */ if (sc->num_paths == 1) { bhnd_nvstore_path *path; /* If the backing instance provides its own name-based lookup * indexing, we can skip generating a duplicate here */ if (sc->data_caps & BHND_NVRAM_DATA_CAP_INDEXED) return (0); /* If the sole root path contains fewer variables than the * minimum indexing threshhold, we do not need to generate an * index */ path = bhnd_nvstore_get_root_path(sc); if (path->num_vars < BHND_NV_IDX_VAR_THRESHOLD) return (0); } /* Allocate per-path index instances */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *path; LIST_FOREACH(path, &sc->paths[i], np_link) { path->index = bhnd_nvstore_index_new(path->num_vars); if (path->index == NULL) return (ENOMEM); } } /* Populate per-path indexes */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { bhnd_nvstore_name_info info; bhnd_nvstore_path *path; /* Parse the name info */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info); if (error) return (error); switch (info.type) { case BHND_NVSTORE_VAR: /* Fetch referenced path */ path = bhnd_nvstore_var_get_path(sc, &info); BHND_NV_ASSERT(path != NULL, ("dangling path reference")); /* Append to index */ error = bhnd_nvstore_index_append(sc, path->index, cookiep); if (error) return (error); break; case BHND_NVSTORE_ALIAS_DECL: /* Skip */ break; } } /* Prepare indexes for querying */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *path; LIST_FOREACH(path, &sc->paths[i], np_link) { error = bhnd_nvstore_index_prepare(sc, path->index); if (error) return (error); } } return (0); } /** * Parse and register path and path alias entries for all declarations found in * the NVRAM data backing @p nvram. * * @param sc The NVRAM store instance. * * @retval 0 success * @retval non-zero If parsing fails, a regular unix error code will be * returned. */ static int bhnd_nvstore_parse_path_entries(struct bhnd_nvram_store *sc) { const char *name; void *cookiep; int error; BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED); /* Skip path registration if the data source does not support device * paths. */ if (!(sc->data_caps & BHND_NVRAM_DATA_CAP_DEVPATHS)) { BHND_NV_ASSERT(sc->root_path != NULL, ("missing root path")); return (0); } /* Otherwise, parse and register all paths and path aliases */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { bhnd_nvstore_name_info info; /* Parse the name info */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info); if (error) return (error); /* Register the path */ error = bhnd_nvstore_var_register_path(sc, &info, cookiep); if (error) { BHND_NV_LOG("failed to register path for %s: %d\n", name, error); return (error); } } return (0); } /** * Merge exported per-path variables (uncommitted, committed, or both) into * the empty @p merged property list. * * @param sc The NVRAM store instance. * @param path The NVRAM path to be exported. * @param merged The property list to populate with the merged results. * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval non-zero If merging the variables defined in @p path otherwise * fails, a regular unix error code will be returned. */ static int bhnd_nvstore_export_merge(struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, bhnd_nvram_plist *merged, uint32_t flags) { void *cookiep, *idxp; int error; /* Populate merged list with all pending variables */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) { bhnd_nvram_prop *prop; prop = NULL; while ((prop = bhnd_nvram_plist_next(path->pending, prop))) { /* Skip variables marked for deletion */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED)) { if (bhnd_nvram_prop_is_null(prop)) continue; } /* Append to merged list */ error = bhnd_nvram_plist_append(merged, prop); if (error) return (error); } } /* Skip merging committed variables? */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMMITTED)) return (0); /* Merge in the committed NVRAM variables */ idxp = NULL; while ((cookiep = bhnd_nvstore_path_data_next(sc, path, &idxp))) { const char *name; bhnd_nvram_val *val; /* Fetch the variable name */ name = bhnd_nvram_data_getvar_name(sc->data, cookiep); /* Trim device path prefix */ if (sc->data_caps & BHND_NVRAM_DATA_CAP_DEVPATHS) name = bhnd_nvram_trim_path_name(name); /* Skip if already defined in pending updates */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) { if (bhnd_nvram_plist_contains(path->pending, name)) continue; } /* Skip if higher precedence value was already defined. This * may occur if the underlying data store contains duplicate * keys; iteration will always return the definition with * the highest precedence first */ if (bhnd_nvram_plist_contains(merged, name)) continue; /* Fetch the variable's value representation */ if ((error = bhnd_nvram_data_copy_val(sc->data, cookiep, &val))) return (error); /* Add to path variable list */ error = bhnd_nvram_plist_append_val(merged, name, val); bhnd_nvram_val_release(val); if (error) return (error); } return (0); } /** * Find a free alias value for @p path, and append the devpathXX alias * declaration to @p plist. * * @param sc The NVRAM store instance. * @param path The NVRAM path for which a devpath alias * variable should be produced. * @param devpath The devpathXX path value for @p path. * @param plist The property list to which @p path's devpath * variable will be appended. * @param[out] alias_val On success, will be set to the alias value * allocated for @p path. * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval non-zero If merging the variables defined in @p path otherwise * fails, a regular unix error code will be returned. */ static int bhnd_nvstore_export_devpath_alias(struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, const char *devpath, bhnd_nvram_plist *plist, u_long *alias_val) { bhnd_nvstore_alias *alias; char *pathvar; int error; *alias_val = 0; /* Prefer alias value already reserved for this path. */ alias = bhnd_nvstore_find_alias(sc, path->path_str); if (alias != NULL) { *alias_val = alias->alias; /* Allocate devpathXX variable name */ bhnd_nv_asprintf(&pathvar, "devpath%lu", *alias_val); if (pathvar == NULL) return (ENOMEM); /* Append alias variable to property list */ error = bhnd_nvram_plist_append_string(plist, pathvar, devpath); BHND_NV_ASSERT(error != EEXIST, ("reserved alias %lu:%s in use", * alias_val, path->path_str)); bhnd_nv_free(pathvar); return (error); } /* Find the next free devpathXX alias entry */ while (1) { /* Skip existing reserved alias values */ while (bhnd_nvstore_get_alias(sc, *alias_val) != NULL) { if (*alias_val == ULONG_MAX) return (ENOMEM); (*alias_val)++; } /* Allocate devpathXX variable name */ bhnd_nv_asprintf(&pathvar, "devpath%lu", *alias_val); if (pathvar == NULL) return (ENOMEM); /* If not in-use, we can terminate the search */ if (!bhnd_nvram_plist_contains(plist, pathvar)) break; /* Keep searching */ bhnd_nv_free(pathvar); if (*alias_val == ULONG_MAX) return (ENOMEM); (*alias_val)++; } /* Append alias variable to property list */ error = bhnd_nvram_plist_append_string(plist, pathvar, devpath); bhnd_nv_free(pathvar); return (error); } /** * Export a single @p child path's properties, appending the result to @p plist. * * @param sc The NVRAM store instance. * @param top The root NVRAM path being exported. * @param child The NVRAM path to be exported. * @param plist The property list to which @p child's exported * properties should be appended. * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval non-zero If merging the variables defined in @p path otherwise * fails, a regular unix error code will be returned. */ static int bhnd_nvram_store_export_child(struct bhnd_nvram_store *sc, bhnd_nvstore_path *top, bhnd_nvstore_path *child, bhnd_nvram_plist *plist, uint32_t flags) { bhnd_nvram_plist *path_vars; bhnd_nvram_prop *prop; const char *relpath; char *prefix, *namebuf; size_t prefix_len, relpath_len; size_t namebuf_size, num_props; bool emit_compact_devpath; int error; BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED); prefix = NULL; num_props = 0; path_vars = NULL; namebuf = NULL; /* Determine the path relative to the top-level path */ relpath = bhnd_nvstore_parse_relpath(top->path_str, child->path_str); if (relpath == NULL) { /* Skip -- not a child of the root path */ return (0); } relpath_len = strlen(relpath); /* Skip sub-path if export of children was not requested, */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_CHILDREN) && relpath_len > 0) return (0); /* Collect all variables to be included in the export */ if ((path_vars = bhnd_nvram_plist_new()) == NULL) return (ENOMEM); if ((error = bhnd_nvstore_export_merge(sc, child, path_vars, flags))) { bhnd_nvram_plist_release(path_vars); return (error); } /* Skip if no children are to be exported */ if (bhnd_nvram_plist_count(path_vars) == 0) { bhnd_nvram_plist_release(path_vars); return (0); } /* Determine appropriate device path encoding */ emit_compact_devpath = false; if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS)) { /* Re-encode as compact (if non-empty path) */ if (relpath_len > 0) emit_compact_devpath = true; } else if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS)) { /* Re-encode with fully expanded device path */ emit_compact_devpath = false; } else if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) { /* Preserve existing encoding of this path */ if (bhnd_nvstore_find_alias(sc, child->path_str) != NULL) emit_compact_devpath = true; } else { BHND_NV_LOG("invalid device path flag: %#" PRIx32, flags); error = EINVAL; goto finished; } /* Allocate variable device path prefix to use for all property names, * and if using compact encoding, emit the devpathXX= variable */ prefix = NULL; prefix_len = 0; if (emit_compact_devpath) { u_long alias_val; int len; /* Reserve an alias value and append the devpathXX= variable to * the property list */ error = bhnd_nvstore_export_devpath_alias(sc, child, relpath, plist, &alias_val); if (error) goto finished; /* Allocate variable name prefix */ len = bhnd_nv_asprintf(&prefix, "%lu:", alias_val); if (prefix == NULL) { error = ENOMEM; goto finished; } prefix_len = len; } else if (relpath_len > 0) { int len; /* Allocate the variable name prefix, appending '/' to the * relative path */ len = bhnd_nv_asprintf(&prefix, "%s/", relpath); if (prefix == NULL) { error = ENOMEM; goto finished; } prefix_len = len; } /* If prefixing of variable names is required, allocate a name * formatting buffer */ namebuf_size = 0; if (prefix != NULL) { size_t maxlen; /* Find the maximum name length */ maxlen = 0; prop = NULL; while ((prop = bhnd_nvram_plist_next(path_vars, prop))) { const char *name; name = bhnd_nvram_prop_name(prop); maxlen = bhnd_nv_ummax(strlen(name), maxlen); } /* Allocate name buffer (path-prefix + name + '\0') */ namebuf_size = prefix_len + maxlen + 1; namebuf = bhnd_nv_malloc(namebuf_size); if (namebuf == NULL) { error = ENOMEM; goto finished; } } /* Append all path variables to the export plist, prepending the * device-path prefix to the variable names, if required */ prop = NULL; while ((prop = bhnd_nvram_plist_next(path_vars, prop)) != NULL) { const char *name; /* Prepend device prefix to the variable name */ name = bhnd_nvram_prop_name(prop); if (prefix != NULL) { int len; /* * Write prefixed variable name to our name buffer. * * We precalcuate the size when scanning all names * above, so this should always succeed. */ len = snprintf(namebuf, namebuf_size, "%s%s", prefix, name); if (len < 0 || (size_t)len >= namebuf_size) BHND_NV_PANIC("invalid max_name_len"); name = namebuf; } /* Add property to export plist */ error = bhnd_nvram_plist_append_val(plist, name, bhnd_nvram_prop_val(prop)); if (error) goto finished; } /* Success */ error = 0; finished: if (prefix != NULL) bhnd_nv_free(prefix); if (namebuf != NULL) bhnd_nv_free(namebuf); if (path_vars != NULL) bhnd_nvram_plist_release(path_vars); return (error); } /** * Export a flat, ordered NVRAM property list representation of all NVRAM * properties at @p path. * * @param sc The NVRAM store instance. * @param path The NVRAM path to export, or NULL to select the root * path. * @param[out] cls On success, will be set to the backing data class * of @p sc. If the data class is are not desired, * a NULL pointer may be provided. * @param[out] props On success, will be set to a caller-owned property * list containing the exported properties. The caller is * responsible for releasing this value via * bhnd_nvram_plist_release(). * @param[out] options On success, will be set to a caller-owned property * list containing the current NVRAM serialization options * for @p sc. The caller is responsible for releasing this * value via bhnd_nvram_plist_release(). * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval EINVAL If @p flags is invalid. * @retval ENOENT The requested path was not found. * @retval ENOMEM If allocation fails. * @retval non-zero If export of @p path otherwise fails, a regular unix * error code will be returned. */ int bhnd_nvram_store_export(struct bhnd_nvram_store *sc, const char *path, bhnd_nvram_data_class **cls, bhnd_nvram_plist **props, bhnd_nvram_plist **options, uint32_t flags) { bhnd_nvram_plist *unordered; bhnd_nvstore_path *top; bhnd_nvram_prop *prop; const char *name; void *cookiep; size_t num_dpath_flags; int error; *props = NULL; unordered = NULL; num_dpath_flags = 0; if (options != NULL) *options = NULL; /* Default to exporting root path */ if (path == NULL) path = BHND_NVSTORE_ROOT_PATH; /* Default to exporting all properties */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMMITTED) && !BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) { flags |= BHND_NVSTORE_EXPORT_ALL_VARS; } /* Default to preserving the current device path encoding */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS) && !BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS)) { flags |= BHND_NVSTORE_EXPORT_PRESERVE_DEVPATHS; } /* Exactly one device path encoding flag must be set */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS)) num_dpath_flags++; if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS)) num_dpath_flags++; if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) num_dpath_flags++; if (num_dpath_flags != 1) return (EINVAL); /* If EXPORT_DELETED is set, EXPORT_UNCOMMITTED must be set too */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED) && !BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED)) { return (EINVAL); } /* Lock internal state before querying paths/properties */ BHND_NVSTORE_LOCK(sc); /* Fetch referenced path */ top = bhnd_nvstore_get_path(sc, path, strlen(path)); if (top == NULL) { error = ENOENT; goto failed; } /* Allocate new, empty property list */ if ((unordered = bhnd_nvram_plist_new()) == NULL) { error = ENOMEM; goto failed; } /* Export the top-level path first */ error = bhnd_nvram_store_export_child(sc, top, top, unordered, flags); if (error) goto failed; /* Attempt to export any children of the root path */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *child; LIST_FOREACH(child, &sc->paths[i], np_link) { /* Top-level path was already exported */ if (child == top) continue; error = bhnd_nvram_store_export_child(sc, top, child, unordered, flags); if (error) goto failed; } } /* If requested, provide the current class and serialization options */ if (cls != NULL) *cls = bhnd_nvram_data_get_class(sc->data); if (options != NULL) *options = bhnd_nvram_plist_retain(sc->data_opts); /* * If we're re-encoding device paths, don't bother preserving the * existing NVRAM variable order; our variable names will not match * the existing backing NVRAM data. */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) { *props = unordered; unordered = NULL; goto finished; } /* * Re-order the flattened output to match the existing NVRAM variable * ordering. * * We append all new variables at the end of the input; this should * reduce the delta that needs to be written (e.g. to flash) when * committing NVRAM updates, and should result in a serialization * identical to the input serialization if uncommitted updates are * excluded from the export. */ if ((*props = bhnd_nvram_plist_new()) == NULL) { error = ENOMEM; goto failed; } /* Using the backing NVRAM data ordering to order all variables * currently defined in the backing store */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { prop = bhnd_nvram_plist_get_prop(unordered, name); if (prop == NULL) continue; /* Append to ordered result */ if ((error = bhnd_nvram_plist_append(*props, prop))) goto failed; /* Remove from unordered list */ bhnd_nvram_plist_remove(unordered, name); } /* Any remaining variables are new, and should be appended to the * end of the export list */ prop = NULL; while ((prop = bhnd_nvram_plist_next(unordered, prop)) != NULL) { if ((error = bhnd_nvram_plist_append(*props, prop))) goto failed; } /* Export complete */ finished: BHND_NVSTORE_UNLOCK(sc); if (unordered != NULL) bhnd_nvram_plist_release(unordered); return (0); failed: BHND_NVSTORE_UNLOCK(sc); if (unordered != NULL) bhnd_nvram_plist_release(unordered); if (options != NULL && *options != NULL) bhnd_nvram_plist_release(*options); if (*props != NULL) bhnd_nvram_plist_release(*props); return (error); } /** * Encode all NVRAM properties at @p path, using the @p store's current NVRAM * data format. * * @param sc The NVRAM store instance. * @param path The NVRAM path to export, or NULL to select the root * path. * @param[out] data On success, will be set to the newly serialized value. * The caller is responsible for freeing this value * via bhnd_nvram_io_free(). * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval EINVAL If @p flags is invalid. * @retval ENOENT The requested path was not found. * @retval ENOMEM If allocation fails. * @retval non-zero If serialization of @p path otherwise fails, a regular * unix error code will be returned. */ int bhnd_nvram_store_serialize(struct bhnd_nvram_store *sc, const char *path, struct bhnd_nvram_io **data, uint32_t flags) { bhnd_nvram_plist *props; bhnd_nvram_plist *options; bhnd_nvram_data_class *cls; struct bhnd_nvram_io *io; void *outp; size_t olen; int error; props = NULL; options = NULL; io = NULL; /* Perform requested export */ error = bhnd_nvram_store_export(sc, path, &cls, &props, &options, flags); if (error) return (error); /* Determine serialized size */ error = bhnd_nvram_data_serialize(cls, props, options, NULL, &olen); if (error) goto failed; /* Allocate output buffer */ if ((io = bhnd_nvram_iobuf_empty(olen, olen)) == NULL) { error = ENOMEM; goto failed; } /* Fetch write pointer */ if ((error = bhnd_nvram_io_write_ptr(io, 0, &outp, olen, NULL))) goto failed; /* Perform serialization */ error = bhnd_nvram_data_serialize(cls, props, options, outp, &olen); if (error) goto failed; if ((error = bhnd_nvram_io_setsize(io, olen))) goto failed; /* Success */ bhnd_nvram_plist_release(props); bhnd_nvram_plist_release(options); *data = io; return (0); failed: if (props != NULL) bhnd_nvram_plist_release(props); if (options != NULL) bhnd_nvram_plist_release(options); if (io != NULL) bhnd_nvram_io_free(io); return (error); } /** * Read an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * @param[out] outp On success, the requested value will be written * to this buffer. This argment may be NULL if * the value is not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual size of the requested value. * @param otype The requested data type to be written to * @p outp. * * @retval 0 success * @retval ENOENT The requested variable was not found. * @retval ENOMEM If @p outp is non-NULL and a buffer of @p olen is too * small to hold the requested value. * @retval non-zero If reading @p name otherwise fails, a regular unix * error code will be returned. */ int bhnd_nvram_store_getvar(struct bhnd_nvram_store *sc, const char *name, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvstore_name_info info; bhnd_nvstore_path *path; bhnd_nvram_prop *prop; void *cookiep; int error; BHND_NVSTORE_LOCK(sc); /* Parse the variable name */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_EXTERNAL, sc->data_caps, &info); if (error) goto finished; /* Fetch the variable's enclosing path entry */ if ((path = bhnd_nvstore_var_get_path(sc, &info)) == NULL) { error = ENOENT; goto finished; } /* Search uncommitted updates first */ prop = bhnd_nvstore_path_get_update(sc, path, info.name); if (prop != NULL) { if (bhnd_nvram_prop_is_null(prop)) { /* NULL denotes a pending deletion */ error = ENOENT; } else { error = bhnd_nvram_prop_encode(prop, outp, olen, otype); } goto finished; } /* Search the backing NVRAM data */ cookiep = bhnd_nvstore_path_data_lookup(sc, path, info.name); if (cookiep != NULL) { /* Found in backing store */ error = bhnd_nvram_data_getvar(sc->data, cookiep, outp, olen, otype); goto finished; } /* Not found */ error = ENOENT; finished: BHND_NVSTORE_UNLOCK(sc); return (error); } /** * Common bhnd_nvram_store_set*() and bhnd_nvram_store_unsetvar() * implementation. * * If @p value is NULL, the variable will be marked for deletion. */ static int bhnd_nvram_store_setval_common(struct bhnd_nvram_store *sc, const char *name, bhnd_nvram_val *value) { bhnd_nvstore_path *path; bhnd_nvstore_name_info info; int error; BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED); /* Parse the variable name */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_EXTERNAL, sc->data_caps, &info); if (error) return (error); /* Fetch the variable's enclosing path entry */ if ((path = bhnd_nvstore_var_get_path(sc, &info)) == NULL) return (error); /* Register the update entry */ return (bhnd_nvstore_path_register_update(sc, path, info.name, value)); } /** * Set an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * @param value The new value. * * @retval 0 success * @retval ENOENT The requested variable @p name was not found. * @retval EINVAL If @p value is invalid. */ int bhnd_nvram_store_setval(struct bhnd_nvram_store *sc, const char *name, bhnd_nvram_val *value) { int error; BHND_NVSTORE_LOCK(sc); error = bhnd_nvram_store_setval_common(sc, name, value); BHND_NVSTORE_UNLOCK(sc); return (error); } /** * Set an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * @param[out] inp The new value. * @param[in,out] ilen The size of @p inp. * @param itype The data type of @p inp. * * @retval 0 success * @retval ENOENT The requested variable @p name was not found. * @retval EINVAL If the new value is invalid. * @retval EINVAL If @p name is read-only. */ int bhnd_nvram_store_setvar(struct bhnd_nvram_store *sc, const char *name, const void *inp, size_t ilen, bhnd_nvram_type itype) { bhnd_nvram_val val; int error; error = bhnd_nvram_val_init(&val, NULL, inp, ilen, itype, BHND_NVRAM_VAL_FIXED|BHND_NVRAM_VAL_BORROW_DATA); if (error) { BHND_NV_LOG("error initializing value: %d\n", error); return (EINVAL); } BHND_NVSTORE_LOCK(sc); error = bhnd_nvram_store_setval_common(sc, name, &val); BHND_NVSTORE_UNLOCK(sc); bhnd_nvram_val_release(&val); return (error); } /** * Unset an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * * @retval 0 success * @retval ENOENT The requested variable @p name was not found. * @retval EINVAL If @p name is read-only. */ int bhnd_nvram_store_unsetvar(struct bhnd_nvram_store *sc, const char *name) { int error; BHND_NVSTORE_LOCK(sc); error = bhnd_nvram_store_setval_common(sc, name, BHND_NVRAM_VAL_NULL); BHND_NVSTORE_UNLOCK(sc); return (error); } Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value.c =================================================================== --- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value.c (revision 350434) +++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value.c (revision 350435) @@ -1,1936 +1,1937 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #ifdef _KERNEL #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_valuevar.h" static int bhnd_nvram_val_fmt_filter(const bhnd_nvram_val_fmt **fmt, const void *inp, size_t ilen, bhnd_nvram_type itype); static void *bhnd_nvram_val_alloc_bytes(bhnd_nvram_val *value, size_t ilen, bhnd_nvram_type itype, uint32_t flags); static int bhnd_nvram_val_set(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags); static int bhnd_nvram_val_set_inline(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype); static int bhnd_nvram_val_encode_data(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_int(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_null(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_bool(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_string(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); /** Initialize an empty value instance with @p _fmt, @p _storage, and * an implicit callee-owned reference */ #define BHND_NVRAM_VAL_INITIALIZER(_fmt, _storage) \ (bhnd_nvram_val) { \ .refs = 1, \ .val_storage = _storage, \ .fmt = _fmt, \ .data_storage = BHND_NVRAM_VAL_DATA_NONE, \ }; /** Assert that @p value's backing representation state has initialized * as empty. */ #define BHND_NVRAM_VAL_ASSERT_EMPTY(_value) \ BHND_NV_ASSERT( \ value->data_storage == BHND_NVRAM_VAL_DATA_NONE && \ value->data_len == 0 && \ value->data.ptr == NULL, \ ("previously initialized value")) /** Return true if BHND_NVRAM_VAL_BORROW_DATA or BHND_NVRAM_VAL_STATIC_DATA is * set in @p _flags (e.g. we should attempt to directly reference external * data */ #define BHND_NVRAM_VAL_EXTREF_BORROWED_DATA(_flags) \ (((_flags) & BHND_NVRAM_VAL_BORROW_DATA) || \ ((_flags) & BHND_NVRAM_VAL_STATIC_DATA)) /** Flags permitted when performing val-based initialization via * bhnd_nvram_val_convert_init() or bhnd_nvram_val_convert_new() */ #define BHND_NVRAM_VALID_CONV_FLAGS \ (BHND_NVRAM_VAL_FIXED | \ BHND_NVRAM_VAL_DYNAMIC | \ BHND_NVRAM_VAL_COPY_DATA) /** Returns true if @p _val must be copied in bhnd_nvram_val_copy(), false * if its reference count may be safely incremented */ #define BHND_NVRAM_VAL_NEED_COPY(_val) \ ((_val)->val_storage == BHND_NVRAM_VAL_STORAGE_AUTO || \ (_val)->data_storage == BHND_NVRAM_VAL_DATA_EXT_WEAK) volatile u_int refs; /**< reference count */ bhnd_nvram_val_storage val_storage; /**< value structure storage */ const bhnd_nvram_val_fmt *fmt; /**< value format */ bhnd_nvram_val_data_storage data_storage; /**< data storage */ bhnd_nvram_type data_type; /**< data type */ size_t data_len; /**< data size */ /* Shared NULL value instance */ bhnd_nvram_val bhnd_nvram_val_null = { .refs = 1, .val_storage = BHND_NVRAM_VAL_STORAGE_STATIC, .fmt = &bhnd_nvram_val_null_fmt, .data_storage = BHND_NVRAM_VAL_DATA_INLINE, .data_type = BHND_NVRAM_TYPE_NULL, .data_len = 0, }; /** * Return the human-readable name of @p fmt. */ const char * bhnd_nvram_val_fmt_name(const bhnd_nvram_val_fmt *fmt) { return (fmt->name); } /** * Return the default format for values of @p type. */ const bhnd_nvram_val_fmt * bhnd_nvram_val_default_fmt(bhnd_nvram_type type) { switch (type) { case BHND_NVRAM_TYPE_UINT8: return (&bhnd_nvram_val_uint8_fmt); case BHND_NVRAM_TYPE_UINT16: return (&bhnd_nvram_val_uint16_fmt); case BHND_NVRAM_TYPE_UINT32: return (&bhnd_nvram_val_uint32_fmt); case BHND_NVRAM_TYPE_UINT64: return (&bhnd_nvram_val_uint64_fmt); case BHND_NVRAM_TYPE_INT8: return (&bhnd_nvram_val_int8_fmt); case BHND_NVRAM_TYPE_INT16: return (&bhnd_nvram_val_int16_fmt); case BHND_NVRAM_TYPE_INT32: return (&bhnd_nvram_val_int32_fmt); case BHND_NVRAM_TYPE_INT64: return (&bhnd_nvram_val_int64_fmt); case BHND_NVRAM_TYPE_CHAR: return (&bhnd_nvram_val_char_fmt); case BHND_NVRAM_TYPE_STRING: return (&bhnd_nvram_val_string_fmt); case BHND_NVRAM_TYPE_BOOL: return (&bhnd_nvram_val_bool_fmt); case BHND_NVRAM_TYPE_NULL: return (&bhnd_nvram_val_null_fmt); case BHND_NVRAM_TYPE_DATA: return (&bhnd_nvram_val_data_fmt); case BHND_NVRAM_TYPE_UINT8_ARRAY: return (&bhnd_nvram_val_uint8_array_fmt); case BHND_NVRAM_TYPE_UINT16_ARRAY: return (&bhnd_nvram_val_uint16_array_fmt); case BHND_NVRAM_TYPE_UINT32_ARRAY: return (&bhnd_nvram_val_uint32_array_fmt); case BHND_NVRAM_TYPE_UINT64_ARRAY: return (&bhnd_nvram_val_uint64_array_fmt); case BHND_NVRAM_TYPE_INT8_ARRAY: return (&bhnd_nvram_val_int8_array_fmt); case BHND_NVRAM_TYPE_INT16_ARRAY: return (&bhnd_nvram_val_int16_array_fmt); case BHND_NVRAM_TYPE_INT32_ARRAY: return (&bhnd_nvram_val_int32_array_fmt); case BHND_NVRAM_TYPE_INT64_ARRAY: return (&bhnd_nvram_val_int64_array_fmt); case BHND_NVRAM_TYPE_CHAR_ARRAY: return (&bhnd_nvram_val_char_array_fmt); case BHND_NVRAM_TYPE_STRING_ARRAY: return (&bhnd_nvram_val_string_array_fmt); case BHND_NVRAM_TYPE_BOOL_ARRAY: return (&bhnd_nvram_val_bool_array_fmt); } /* Quiesce gcc4.2 */ BHND_NV_PANIC("bhnd nvram type %u unknown", type); } /** * Determine whether @p fmt (or new format delegated to by @p fmt) is * capable of direct initialization from buffer @p inp. * * @param[in,out] fmt Indirect pointer to the NVRAM value format. If * the format instance cannot handle the data type * directly, it may delegate to a new format * instance. On success, this parameter will be * set to the format that should be used when * performing initialization from @p inp. * @param inp Input data. * @param ilen Input data length. * @param itype Input data type. * * @retval 0 If initialization from @p inp is supported. * @retval EFTYPE If initialization from @p inp is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. */ static int bhnd_nvram_val_fmt_filter(const bhnd_nvram_val_fmt **fmt, const void *inp, size_t ilen, bhnd_nvram_type itype) { const bhnd_nvram_val_fmt *ofmt, *nfmt; int error; nfmt = ofmt = *fmt; /* Validate alignment */ if ((error = bhnd_nvram_value_check_aligned(inp, ilen, itype))) return (error); /* If the format does not provide a filter function, it only supports * direct initialization from its native type */ if (ofmt->op_filter == NULL) { if (itype == ofmt->native_type) return (0); return (EFTYPE); } /* Use the filter function to determine whether direct initialization * from itype is permitted */ error = ofmt->op_filter(&nfmt, inp, ilen, itype); if (error) return (error); /* Retry filter with new format? */ if (ofmt != nfmt) { error = bhnd_nvram_val_fmt_filter(&nfmt, inp, ilen, itype); if (error) return (error); /* Success -- provide delegated format to caller */ *fmt = nfmt; } /* Value can be initialized with provided format and input type */ return (0); } /* Common initialization support for bhnd_nvram_val_init() and * bhnd_nvram_val_new() */ static int bhnd_nvram_val_init_common(bhnd_nvram_val *value, bhnd_nvram_val_storage val_storage, const bhnd_nvram_val_fmt *fmt, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { void *outp; bhnd_nvram_type otype; size_t olen; int error; /* If the value format is unspecified, we use the default format * for the input data type */ if (fmt == NULL) fmt = bhnd_nvram_val_default_fmt(itype); /* Determine expected data type, and allow the format to delegate to * a new format instance */ if ((error = bhnd_nvram_val_fmt_filter(&fmt, inp, ilen, itype))) { /* Direct initialization from the provided input type is * not supported; alue must be initialized with the format's * native type */ otype = fmt->native_type; } else { /* Value can be initialized with provided input type */ otype = itype; } /* Initialize value instance */ *value = BHND_NVRAM_VAL_INITIALIZER(fmt, val_storage); /* If input data already in native format, init directly. */ if (otype == itype) { error = bhnd_nvram_val_set(value, inp, ilen, itype, flags); if (error) return (error); return (0); } /* Determine size when encoded in native format */ error = bhnd_nvram_value_coerce(inp, ilen, itype, NULL, &olen, otype); if (error) return (error); /* Fetch reference to (or allocate) an appropriately sized buffer */ outp = bhnd_nvram_val_alloc_bytes(value, olen, otype, flags); if (outp == NULL) return (ENOMEM); /* Perform encode */ error = bhnd_nvram_value_coerce(inp, ilen, itype, outp, &olen, otype); if (error) return (error); return (0); } /** * Initialize an externally allocated instance of @p value with @p fmt from the * given @p inp buffer of @p itype and @p ilen. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param value The externally allocated value instance to be * initialized. * @param fmt The value's format, or NULL to use the default format * for @p itype. * @param inp Input buffer. * @param ilen Input buffer length. * @param itype Input buffer type. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p itype is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion would overflow (or underflow) the * @p fmt representation. */ int bhnd_nvram_val_init(bhnd_nvram_val *value, const bhnd_nvram_val_fmt *fmt, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { int error; error = bhnd_nvram_val_init_common(value, BHND_NVRAM_VAL_STORAGE_AUTO, fmt, inp, ilen, itype, flags); if (error) bhnd_nvram_val_release(value); return (error); } /** * Allocate a value instance with @p fmt, and attempt to initialize its internal * representation from the given @p inp buffer of @p itype and @p ilen. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param[out] value On success, the allocated value instance. * @param fmt The value's format, or NULL to use the default format * for @p itype. * @param inp Input buffer. * @param ilen Input buffer length. * @param itype Input buffer type. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p itype is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion would overflow (or underflow) the * @p fmt representation. */ int bhnd_nvram_val_new(bhnd_nvram_val **value, const bhnd_nvram_val_fmt *fmt, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { int error; /* Allocate new instance */ if ((*value = bhnd_nv_malloc(sizeof(**value))) == NULL) return (ENOMEM); /* Perform common initialization. */ error = bhnd_nvram_val_init_common(*value, BHND_NVRAM_VAL_STORAGE_DYNAMIC, fmt, inp, ilen, itype, flags); if (error) { /* Will also free() the value allocation */ bhnd_nvram_val_release(*value); } return (error); } /* Common initialization support for bhnd_nvram_val_convert_init() and * bhnd_nvram_val_convert_new() */ static int bhnd_nvram_val_convert_common(bhnd_nvram_val *value, bhnd_nvram_val_storage val_storage, const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags) { const void *inp; void *outp; bhnd_nvram_type itype, otype; size_t ilen, olen; int error; /* Determine whether direct initialization from the source value's * existing data type is supported by the new format */ inp = bhnd_nvram_val_bytes(src, &ilen, &itype); if (bhnd_nvram_val_fmt_filter(&fmt, inp, ilen, itype) == 0) { /* Adjust value flags based on the source data storage */ switch (src->data_storage) { case BHND_NVRAM_VAL_DATA_NONE: case BHND_NVRAM_VAL_DATA_INLINE: case BHND_NVRAM_VAL_DATA_EXT_WEAK: case BHND_NVRAM_VAL_DATA_EXT_ALLOC: break; case BHND_NVRAM_VAL_DATA_EXT_STATIC: /* If the source data has static storage duration, * we should apply that transitively */ if (flags & BHND_NVRAM_VAL_BORROW_DATA) flags |= BHND_NVRAM_VAL_STATIC_DATA; break; } /* Delegate to standard initialization */ return (bhnd_nvram_val_init_common(value, val_storage, fmt, inp, ilen, itype, flags)); } /* Value must be initialized with the format's native type */ otype = fmt->native_type; /* Initialize value instance */ *value = BHND_NVRAM_VAL_INITIALIZER(fmt, val_storage); /* Determine size when encoded in native format */ if ((error = bhnd_nvram_val_encode(src, NULL, &olen, otype))) return (error); /* Fetch reference to (or allocate) an appropriately sized buffer */ outp = bhnd_nvram_val_alloc_bytes(value, olen, otype, flags); if (outp == NULL) return (ENOMEM); /* Perform encode */ if ((error = bhnd_nvram_val_encode(src, outp, &olen, otype))) return (error); return (0); } /** * Initialize an externally allocated instance of @p value with @p fmt, and * attempt to initialize its internal representation from the given @p src * value. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param value The externally allocated value instance to be * initialized. * @param fmt The value's format. * @param src Input value to be converted. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p src is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion of @p src would overflow * (or underflow) the @p fmt representation. */ int bhnd_nvram_val_convert_init(bhnd_nvram_val *value, const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags) { int error; error = bhnd_nvram_val_convert_common(value, BHND_NVRAM_VAL_STORAGE_AUTO, fmt, src, flags); if (error) bhnd_nvram_val_release(value); return (error); } /** * Allocate a value instance with @p fmt, and attempt to initialize its internal * representation from the given @p src value. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param[out] value On success, the allocated value instance. * @param fmt The value's format. * @param src Input value to be converted. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p src is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion of @p src would overflow * (or underflow) the @p fmt representation. */ int bhnd_nvram_val_convert_new(bhnd_nvram_val **value, const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags) { int error; /* Allocate new instance */ if ((*value = bhnd_nv_malloc(sizeof(**value))) == NULL) return (ENOMEM); /* Perform common initialization. */ error = bhnd_nvram_val_convert_common(*value, BHND_NVRAM_VAL_STORAGE_DYNAMIC, fmt, src, flags); if (error) { /* Will also free() the value allocation */ bhnd_nvram_val_release(*value); } return (error); } /** * Copy or retain a reference to @p value. * * On success, the caller is responsible for freeing the result via * bhnd_nvram_val_release(). * * @param value The value to be copied (or retained). * * @retval bhnd_nvram_val if @p value was successfully copied or retained. * @retval NULL if allocation failed. */ bhnd_nvram_val * bhnd_nvram_val_copy(bhnd_nvram_val *value) { bhnd_nvram_val *result; const void *bytes; bhnd_nvram_type type; size_t len; uint32_t flags; int error; switch (value->val_storage) { case BHND_NVRAM_VAL_STORAGE_STATIC: /* If static, can return as-is */ return (value); case BHND_NVRAM_VAL_STORAGE_DYNAMIC: if (!BHND_NVRAM_VAL_NEED_COPY(value)) { refcount_acquire(&value->refs); return (value); } /* Perform copy below */ break; case BHND_NVRAM_VAL_STORAGE_AUTO: BHND_NV_ASSERT(value->refs == 1, ("non-allocated value has " "active refcount (%u)", value->refs)); /* Perform copy below */ break; } /* Compute the new value's flags based on the source value */ switch (value->data_storage) { case BHND_NVRAM_VAL_DATA_NONE: case BHND_NVRAM_VAL_DATA_INLINE: case BHND_NVRAM_VAL_DATA_EXT_WEAK: case BHND_NVRAM_VAL_DATA_EXT_ALLOC: /* Copy the source data and permit additional allocation if the * value cannot be represented inline */ flags = BHND_NVRAM_VAL_COPY_DATA|BHND_NVRAM_VAL_DYNAMIC; break; case BHND_NVRAM_VAL_DATA_EXT_STATIC: flags = BHND_NVRAM_VAL_STATIC_DATA; break; default: BHND_NV_PANIC("invalid storage type: %d", value->data_storage); } /* Allocate new value copy */ bytes = bhnd_nvram_val_bytes(value, &len, &type); error = bhnd_nvram_val_new(&result, value->fmt, bytes, len, type, flags); if (error) { BHND_NV_LOG("copy failed: %d", error); return (NULL); } return (result); } /** * Release a reference to @p value. * * If this is the last reference, all associated resources will be freed. * * @param value The value to be released. */ void bhnd_nvram_val_release(bhnd_nvram_val *value) { BHND_NV_ASSERT(value->refs >= 1, ("value over-released")); /* Skip if value is static */ if (value->val_storage == BHND_NVRAM_VAL_STORAGE_STATIC) return; /* Drop reference */ if (!refcount_release(&value->refs)) return; /* Free allocated external representation data */ switch (value->data_storage) { case BHND_NVRAM_VAL_DATA_EXT_ALLOC: bhnd_nv_free(__DECONST(void *, value->data.ptr)); break; case BHND_NVRAM_VAL_DATA_NONE: case BHND_NVRAM_VAL_DATA_INLINE: case BHND_NVRAM_VAL_DATA_EXT_WEAK: case BHND_NVRAM_VAL_DATA_EXT_STATIC: /* Nothing to free */ break; } /* Free instance if dynamically allocated */ if (value->val_storage == BHND_NVRAM_VAL_STORAGE_DYNAMIC) bhnd_nv_free(value); } /** * Standard BHND_NVRAM_TYPE_NULL encoding implementation. */ static int bhnd_nvram_val_encode_null(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { size_t limit, nbytes; BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_NULL, ("unsupported type: %d", itype)); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; nbytes = 0; /* Write to output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* Can be directly encoded as a zero-length NULL value */ nbytes = 0; break; default: /* Not representable */ return (EFTYPE); } /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Standard BHND_NVRAM_TYPE_BOOL encoding implementation. */ static int bhnd_nvram_val_encode_bool(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvram_bool_t bval; size_t limit, nbytes, nelem; int error; BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_BOOL, ("unsupported type: %d", itype)); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Must be exactly one element in input */ if ((error = bhnd_nvram_value_nelem(inp, ilen, itype, &nelem))) return (error); if (nelem != 1) return (EFTYPE); /* Fetch (and normalize) boolean value */ bval = (*(const bhnd_nvram_bool_t *)inp != 0) ? true : false; /* Write to output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* False can be directly encoded as a zero-length NULL value */ if (bval != false) return (EFTYPE); nbytes = 0; break; case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: { /* Can encode as "true" or "false" */ const char *str = bval ? "true" : "false"; nbytes = strlen(str) + 1; if (limit > nbytes) strcpy(outp, str); break; } default: /* If output type is an integer, we can delegate to standard * integer encoding to encode as zero or one. */ if (bhnd_nvram_is_int_type(otype)) { uint8_t ival = bval ? 1 : 0; return (bhnd_nvram_val_encode_int(&ival, sizeof(ival), BHND_NVRAM_TYPE_UINT8, outp, olen, otype)); } /* Otherwise not representable */ return (EFTYPE); } /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Standard BHND_NVRAM_TYPE_DATA encoding implementation. */ static int bhnd_nvram_val_encode_data(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_DATA, ("unsupported type: %d", itype)); /* Write to output */ switch (otype) { case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: /* If encoding as a string, produce an EFI-style hexadecimal * byte array (HF1F...) by interpreting the octet string * as an array of uint8 values */ return (bhnd_nvram_value_printf("H%[]02hhX", inp, ilen, BHND_NVRAM_TYPE_UINT8_ARRAY, outp, olen, "")); default: /* Fall back on direct interpretation as an array of 8-bit * integers array */ return (bhnd_nvram_value_coerce(inp, ilen, BHND_NVRAM_TYPE_UINT8_ARRAY, outp, olen, otype)); } } /** * Standard string/char array/char encoding implementation. * * Input type must be one of: * - BHND_NVRAM_TYPE_STRING * - BHND_NVRAM_TYPE_CHAR * - BHND_NVRAM_TYPE_CHAR_ARRAY */ static int bhnd_nvram_val_encode_string(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { const char *cstr; bhnd_nvram_type otype_base; size_t cstr_size, cstr_len; size_t limit, nbytes; BHND_NV_ASSERT( itype == BHND_NVRAM_TYPE_STRING || itype == BHND_NVRAM_TYPE_CHAR || itype == BHND_NVRAM_TYPE_CHAR_ARRAY, ("unsupported type: %d", itype)); cstr = inp; cstr_size = ilen; nbytes = 0; otype_base = bhnd_nvram_base_type(otype); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Determine string length, minus trailing NUL (if any) */ cstr_len = strnlen(cstr, cstr_size); /* Parse the string data and write to output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* Only an empty string may be represented as a NULL value */ if (cstr_len != 0) return (EFTYPE); *olen = 0; return (0); case BHND_NVRAM_TYPE_CHAR: case BHND_NVRAM_TYPE_CHAR_ARRAY: /* String must contain exactly 1 non-terminating-NUL character * to be represented as a single char */ if (!bhnd_nvram_is_array_type(otype)) { if (cstr_len != 1) return (EFTYPE); } /* Copy out the characters directly (excluding trailing NUL) */ for (size_t i = 0; i < cstr_len; i++) { if (limit > nbytes) *((uint8_t *)outp + nbytes) = cstr[i]; nbytes++; } /* Provide required length */ *olen = nbytes; if (limit < *olen && outp != NULL) return (ENOMEM); return (0); case BHND_NVRAM_TYPE_BOOL: case BHND_NVRAM_TYPE_BOOL_ARRAY: { const char *p; size_t plen; bhnd_nvram_bool_t bval; /* Trim leading/trailing whitespace */ p = cstr; plen = bhnd_nvram_trim_field(&p, cstr_len, '\0'); /* Parse string representation */ if (strncasecmp(p, "true", plen) == 0 || strncasecmp(p, "yes", plen) == 0 || strncmp(p, "1", plen) == 0) { bval = true; } else if (strncasecmp(p, "false", plen) == 0 || strncasecmp(p, "no", plen) == 0 || strncmp(p, "0", plen) == 0) { bval = false; } else { /* Not a recognized boolean string */ return (EFTYPE); } /* Write to output */ nbytes = sizeof(bhnd_nvram_bool_t); if (limit >= nbytes) *((bhnd_nvram_bool_t *)outp) = bval; /* Provide required length */ *olen = nbytes; if (limit < *olen && outp != NULL) return (ENOMEM); return (0); } case BHND_NVRAM_TYPE_DATA: { const char *p; size_t plen, parsed_len; int error; /* Trim leading/trailing whitespace */ p = cstr; plen = bhnd_nvram_trim_field(&p, cstr_len, '\0'); /* Check for EFI-style hexadecimal byte array string format. * Must have a 'H' prefix */ if (plen < 1 || bhnd_nv_toupper(*p) != 'H') return (EFTYPE); /* Skip leading 'H' */ p++; plen--; /* Parse the input string's two-char octets until the end * of input is reached. The last octet may contain only * one char */ while (plen > 0) { uint8_t byte; size_t byte_len = sizeof(byte); /* Parse next two-character hex octet */ error = bhnd_nvram_parse_int(p, bhnd_nv_ummin(plen, 2), 16, &parsed_len, &byte, &byte_len, otype_base); if (error) { BHND_NV_DEBUG("error parsing '%.*s' as " "integer: %d\n", BHND_NV_PRINT_WIDTH(plen), p, error); return (error); } /* Write to output */ if (limit > nbytes) *((uint8_t *)outp + nbytes) = byte; nbytes++; /* Advance input */ p += parsed_len; plen -= parsed_len; } /* Provide required length */ *olen = nbytes; if (limit < *olen && outp != NULL) return (ENOMEM); return (0); } case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_UINT8_ARRAY: case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_UINT16_ARRAY: case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_UINT32_ARRAY: case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_UINT64_ARRAY: case BHND_NVRAM_TYPE_INT8: case BHND_NVRAM_TYPE_INT8_ARRAY: case BHND_NVRAM_TYPE_INT16: case BHND_NVRAM_TYPE_INT16_ARRAY: case BHND_NVRAM_TYPE_INT32: case BHND_NVRAM_TYPE_INT32_ARRAY: case BHND_NVRAM_TYPE_INT64: case BHND_NVRAM_TYPE_INT64_ARRAY: { const char *p; size_t plen, parsed_len; int error; /* Trim leading/trailing whitespace */ p = cstr; plen = bhnd_nvram_trim_field(&p, cstr_len, '\0'); /* Try to parse the integer value */ error = bhnd_nvram_parse_int(p, plen, 0, &parsed_len, outp, olen, otype_base); if (error) { BHND_NV_DEBUG("error parsing '%.*s' as integer: %d\n", BHND_NV_PRINT_WIDTH(plen), p, error); return (error); } /* Do additional bytes remain unparsed? */ if (plen != parsed_len) { BHND_NV_DEBUG("error parsing '%.*s' as a single " "integer value; trailing garbage '%.*s'\n", BHND_NV_PRINT_WIDTH(plen), p, BHND_NV_PRINT_WIDTH(plen-parsed_len), p+parsed_len); return (EFTYPE); } return (0); } case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: /* Copy out the string representation as-is */ *olen = cstr_size; /* Need additional space for trailing NUL? */ if (cstr_len == cstr_size) (*olen)++; /* Skip output? */ if (outp == NULL) return (0); /* Verify required length */ if (limit < *olen) return (ENOMEM); /* Copy and NUL terminate */ strncpy(outp, cstr, cstr_len); *((char *)outp + cstr_len) = '\0'; return (0); } BHND_NV_PANIC("unknown type %s", bhnd_nvram_type_name(otype)); } /** * Standard integer encoding implementation. */ static int bhnd_nvram_val_encode_int(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvram_type otype_base; size_t limit, nbytes; bool itype_signed, otype_signed, otype_int; union { uint64_t u64; int64_t i64; } intv; BHND_NV_ASSERT(bhnd_nvram_is_int_type(itype), ("non-integer type")); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Fetch output type info */ otype_base = bhnd_nvram_base_type(otype); otype_int = bhnd_nvram_is_int_type(otype); otype_signed = bhnd_nvram_is_signed_type(otype_base); /* * Promote integer value to a common 64-bit representation. */ switch (itype) { case BHND_NVRAM_TYPE_UINT8: if (ilen != sizeof(uint8_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint8_t *)inp; break; case BHND_NVRAM_TYPE_UINT16: if (ilen != sizeof(uint16_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint16_t *)inp; break; case BHND_NVRAM_TYPE_UINT32: if (ilen != sizeof(uint32_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint32_t *)inp; break; case BHND_NVRAM_TYPE_UINT64: if (ilen != sizeof(uint64_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint64_t *)inp; break; case BHND_NVRAM_TYPE_INT8: if (ilen != sizeof(int8_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int8_t *)inp; break; case BHND_NVRAM_TYPE_INT16: if (ilen != sizeof(int16_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int16_t *)inp; break; case BHND_NVRAM_TYPE_INT32: if (ilen != sizeof(int32_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int32_t *)inp; break; case BHND_NVRAM_TYPE_INT64: if (ilen != sizeof(int32_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int32_t *)inp; break; default: BHND_NV_PANIC("invalid type %d\n", itype); } /* Perform signed/unsigned conversion */ if (itype_signed && otype_int && !otype_signed) { if (intv.i64 < 0) { /* Can't represent negative value */ BHND_NV_LOG("cannot represent %" PRId64 " as %s\n", intv.i64, bhnd_nvram_type_name(otype)); return (ERANGE); } /* Convert to unsigned representation */ intv.u64 = intv.i64; } else if (!itype_signed && otype_int && otype_signed) { /* Handle unsigned -> signed coercions */ if (intv.u64 > INT64_MAX) { /* Can't represent positive value */ BHND_NV_LOG("cannot represent %" PRIu64 " as %s\n", intv.u64, bhnd_nvram_type_name(otype)); return (ERANGE); } /* Convert to signed representation */ intv.i64 = intv.u64; } /* Write output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* Cannot encode an integer value as NULL */ return (EFTYPE); case BHND_NVRAM_TYPE_BOOL: { bhnd_nvram_bool_t bval; if (intv.u64 == 0 || intv.u64 == 1) { bval = intv.u64; } else { /* Encoding as a bool would lose information */ return (ERANGE); } nbytes = sizeof(bhnd_nvram_bool_t); if (limit >= nbytes) *((bhnd_nvram_bool_t *)outp) = bval; break; } case BHND_NVRAM_TYPE_CHAR: case BHND_NVRAM_TYPE_CHAR_ARRAY: case BHND_NVRAM_TYPE_DATA: case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_UINT8_ARRAY: if (intv.u64 > UINT8_MAX) return (ERANGE); nbytes = sizeof(uint8_t); if (limit >= nbytes) *((uint8_t *)outp) = (uint8_t)intv.u64; break; case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_UINT16_ARRAY: if (intv.u64 > UINT16_MAX) return (ERANGE); nbytes = sizeof(uint16_t); if (limit >= nbytes) *((uint16_t *)outp) = (uint16_t)intv.u64; break; case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_UINT32_ARRAY: if (intv.u64 > UINT32_MAX) return (ERANGE); nbytes = sizeof(uint32_t); if (limit >= nbytes) *((uint32_t *)outp) = (uint32_t)intv.u64; break; case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_UINT64_ARRAY: nbytes = sizeof(uint64_t); if (limit >= nbytes) *((uint64_t *)outp) = intv.u64; break; case BHND_NVRAM_TYPE_INT8: case BHND_NVRAM_TYPE_INT8_ARRAY: if (intv.i64 < INT8_MIN || intv.i64 > INT8_MAX) return (ERANGE); nbytes = sizeof(int8_t); if (limit >= nbytes) *((int8_t *)outp) = (int8_t)intv.i64; break; case BHND_NVRAM_TYPE_INT16: case BHND_NVRAM_TYPE_INT16_ARRAY: if (intv.i64 < INT16_MIN || intv.i64 > INT16_MAX) return (ERANGE); nbytes = sizeof(int16_t); if (limit >= nbytes) *((int16_t *)outp) = (int16_t)intv.i64; break; case BHND_NVRAM_TYPE_INT32: case BHND_NVRAM_TYPE_INT32_ARRAY: if (intv.i64 < INT32_MIN || intv.i64 > INT32_MAX) return (ERANGE); nbytes = sizeof(int32_t); if (limit >= nbytes) *((int32_t *)outp) = (int32_t)intv.i64; break; case BHND_NVRAM_TYPE_INT64: case BHND_NVRAM_TYPE_INT64_ARRAY: nbytes = sizeof(int64_t); if (limit >= nbytes) *((int64_t *)outp) = intv.i64; break; case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: { ssize_t len; /* Attempt to write the entry + NUL */ if (otype_signed) { len = snprintf(outp, limit, "%" PRId64, intv.i64); } else { len = snprintf(outp, limit, "%" PRIu64, intv.u64); } if (len < 0) { BHND_NV_LOG("snprintf() failed: %zd\n", len); return (EFTYPE); } /* Set total length to the formatted string length, plus * trailing NUL */ nbytes = len + 1; break; } default: BHND_NV_LOG("unknown type %s\n", bhnd_nvram_type_name(otype)); return (EFTYPE); } /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Encode the given @p value as @p otype, writing the result to @p outp. * * @param value The value to be encoded. * @param[out] outp On success, the value will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual size of the requested value. * @param otype The data type to be written to @p outp. * * @retval 0 success * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to @p otype is * impossible. * @retval ERANGE If value coercion would overflow (or underflow) the * a @p otype representation. */ int bhnd_nvram_val_encode(bhnd_nvram_val *value, void *outp, size_t *olen, bhnd_nvram_type otype) { /* Prefer format implementation */ if (value->fmt->op_encode != NULL) return (value->fmt->op_encode(value, outp, olen, otype)); return (bhnd_nvram_val_generic_encode(value, outp, olen, otype)); } /** * Encode the given @p value's element as @p otype, writing the result to * @p outp. * * @param inp The element to be be encoded. Must be a value * previously returned by bhnd_nvram_val_next() * or bhnd_nvram_val_elem(). * @param ilen The size of @p inp, as returned by * bhnd_nvram_val_next() or bhnd_nvram_val_elem(). * @param[out] outp On success, the value will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual size of the requested value. * @param otype The data type to be written to @p outp. * * @retval 0 success * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to @p otype is * impossible. * @retval ERANGE If value coercion would overflow (or underflow) the * a @p otype representation. */ int bhnd_nvram_val_encode_elem(bhnd_nvram_val *value, const void *inp, size_t ilen, void *outp, size_t *olen, bhnd_nvram_type otype) { /* Prefer format implementation */ if (value->fmt->op_encode_elem != NULL) { return (value->fmt->op_encode_elem(value, inp, ilen, outp, olen, otype)); } return (bhnd_nvram_val_generic_encode_elem(value, inp, ilen, outp, olen, otype)); } /** * Return the type, size, and a pointer to the internal representation * of @p value. * * @param value The value to be queried. * @param[out] olen Size of the returned data, in bytes. * @param[out] otype Data type. */ const void * bhnd_nvram_val_bytes(bhnd_nvram_val *value, size_t *olen, bhnd_nvram_type *otype) { /* Provide type and length */ *otype = value->data_type; *olen = value->data_len; switch (value->data_storage) { case BHND_NVRAM_VAL_DATA_EXT_ALLOC: case BHND_NVRAM_VAL_DATA_EXT_STATIC: case BHND_NVRAM_VAL_DATA_EXT_WEAK: /* Return a pointer to external storage */ return (value->data.ptr); case BHND_NVRAM_VAL_DATA_INLINE: /* Return a pointer to inline storage */ return (&value->data); case BHND_NVRAM_VAL_DATA_NONE: BHND_NV_PANIC("uninitialized value"); } BHND_NV_PANIC("unknown storage type: %d", value->data_storage); } /** * Iterate over all array elements in @p value. * * @param value The value to be iterated * @param prev A value pointer previously returned by * bhnd_nvram_val_next() or bhnd_nvram_val_elem(), * or NULL to begin iteration at the first element. * @param[in,out] olen If @p prev is non-NULL, @p olen must be a * pointer to the length previously returned by * bhnd_nvram_val_next() or bhnd_nvram_val_elem(). * On success, will be set to the next element's * length, in bytes. * * @retval non-NULL A borrowed reference to the element data. * @retval NULL If the end of the element array is reached. */ const void * bhnd_nvram_val_next(bhnd_nvram_val *value, const void *prev, size_t *olen) { /* Prefer the format implementation */ if (value->fmt->op_next != NULL) return (value->fmt->op_next(value, prev, olen)); return (bhnd_nvram_val_generic_next(value, prev, olen)); } /** * Return the value's data type. * * @param value The value to be queried. */ bhnd_nvram_type bhnd_nvram_val_type(bhnd_nvram_val *value) { return (value->data_type); } /** * Return value's element data type. * * @param value The value to be queried. */ bhnd_nvram_type bhnd_nvram_val_elem_type(bhnd_nvram_val *value) { return (bhnd_nvram_base_type(value->data_type)); } /** * Return the total number of elements represented by @p value. */ size_t bhnd_nvram_val_nelem(bhnd_nvram_val *value) { const void *bytes; bhnd_nvram_type type; size_t nelem, len; int error; /* Prefer format implementation */ if (value->fmt->op_nelem != NULL) return (value->fmt->op_nelem(value)); /* * If a custom op_next() is defined, bhnd_nvram_value_nelem() almost * certainly cannot produce a valid element count; it assumes a standard * data format that may not apply when custom iteration is required. * * Instead, use bhnd_nvram_val_next() to parse the backing data and * produce a total count. */ if (value->fmt->op_next != NULL) { const void *next; next = NULL; nelem = 0; while ((next = bhnd_nvram_val_next(value, next, &len)) != NULL) nelem++; return (nelem); } /* Otherwise, compute the standard element count */ bytes = bhnd_nvram_val_bytes(value, &len, &type); if ((error = bhnd_nvram_value_nelem(bytes, len, type, &nelem))) { /* Should always succeed */ BHND_NV_PANIC("error calculating element count for type '%s' " "with length %zu: %d\n", bhnd_nvram_type_name(type), len, error); } return (nelem); } /** * Generic implementation of bhnd_nvram_val_op_encode(), compatible with * all supported NVRAM data types. */ int bhnd_nvram_val_generic_encode(bhnd_nvram_val *value, void *outp, size_t *olen, bhnd_nvram_type otype) { const void *inp; bhnd_nvram_type itype; size_t ilen; const void *next; bhnd_nvram_type otype_base; size_t limit, nelem, nbytes; size_t next_len; int error; nbytes = 0; nelem = 0; otype_base = bhnd_nvram_base_type(otype); inp = bhnd_nvram_val_bytes(value, &ilen, &itype); /* * Normally, an array type is not universally representable as * non-array type. * * As exceptions, we support conversion directly to/from: * - CHAR_ARRAY/STRING: * ->STRING Interpret the character array as a * non-NUL-terminated string. * ->CHAR_ARRAY Trim the trailing NUL from the string. */ #define BHND_NV_IS_ISO_CONV(_lhs, _rhs) \ ((itype == BHND_NVRAM_TYPE_ ## _lhs && \ otype == BHND_NVRAM_TYPE_ ## _rhs) || \ (itype == BHND_NVRAM_TYPE_ ## _rhs && \ otype == BHND_NVRAM_TYPE_ ## _lhs)) if (BHND_NV_IS_ISO_CONV(CHAR_ARRAY, STRING)) { return (bhnd_nvram_val_encode_elem(value, inp, ilen, outp, olen, otype)); } #undef BHND_NV_IS_ISO_CONV /* * If both input and output are non-array types, try to encode them * without performing element iteration. */ if (!bhnd_nvram_is_array_type(itype) && !bhnd_nvram_is_array_type(otype)) { return (bhnd_nvram_val_encode_elem(value, inp, ilen, outp, olen, otype)); } /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Iterate over our array elements and encode as the requested * type */ next = NULL; while ((next = bhnd_nvram_val_next(value, next, &next_len))) { void *elem_outp; size_t elem_nbytes; /* If the output type is not an array type, we can only encode * one element */ nelem++; if (nelem > 1 && !bhnd_nvram_is_array_type(otype)) { return (EFTYPE); } /* Determine output offset / limit */ if (nbytes >= limit) { elem_nbytes = 0; elem_outp = NULL; } else { elem_nbytes = limit - nbytes; elem_outp = (uint8_t *)outp + nbytes; } /* Attempt encode */ error = bhnd_nvram_val_encode_elem(value, next, next_len, elem_outp, &elem_nbytes, otype_base); /* If encoding failed for any reason other than ENOMEM (which * we'll detect and report below), return immediately */ if (error && error != ENOMEM) return (error); /* Add to total length */ if (SIZE_MAX - nbytes < elem_nbytes) return (EFTYPE); /* would overflow size_t */ nbytes += elem_nbytes; } /* Provide the actual length */ *olen = nbytes; /* If no output was requested, nothing left to do */ if (outp == NULL) return (0); /* Otherwise, report a memory error if the output buffer was too * small */ if (limit < nbytes) return (ENOMEM); return (0); } /** * Generic implementation of bhnd_nvram_val_op_encode_elem(), compatible with * all supported NVRAM data types. */ int bhnd_nvram_val_generic_encode_elem(bhnd_nvram_val *value, const void *inp, size_t ilen, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvram_type itype; itype = bhnd_nvram_val_elem_type(value); switch (itype) { case BHND_NVRAM_TYPE_NULL: return (bhnd_nvram_val_encode_null(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_DATA: return (bhnd_nvram_val_encode_data(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_CHAR: return (bhnd_nvram_val_encode_string(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_BOOL: return (bhnd_nvram_val_encode_bool(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_INT8: case BHND_NVRAM_TYPE_INT16: case BHND_NVRAM_TYPE_INT32: case BHND_NVRAM_TYPE_INT64: return (bhnd_nvram_val_encode_int(inp, ilen, itype, outp, olen, otype)); default: BHND_NV_PANIC("missing encode_elem() implementation"); } } /** * Generic implementation of bhnd_nvram_val_op_next(), compatible with * all supported NVRAM data types. */ const void * bhnd_nvram_val_generic_next(bhnd_nvram_val *value, const void *prev, size_t *olen) { const uint8_t *inp; bhnd_nvram_type itype; size_t ilen; /* Iterate over the backing representation */ inp = bhnd_nvram_val_bytes(value, &ilen, &itype); return (bhnd_nvram_value_array_next(inp, ilen, itype, prev, olen)); } /** * Initialize the representation of @p value with @p ptr. * * @param value The value to be initialized. * @param inp The external representation. * @param ilen The external representation length, in bytes. * @param itype The external representation's data type. * @param flags Value flags. * * @retval 0 success. * @retval ENOMEM if allocation fails * @retval EFTYPE if @p itype is not an array type, and @p ilen is not * equal to the size of a single element of @p itype. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. */ static int bhnd_nvram_val_set(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { void *bytes; int error; BHND_NVRAM_VAL_ASSERT_EMPTY(value); /* Validate alignment */ if ((error = bhnd_nvram_value_check_aligned(inp, ilen, itype))) return (error); /* Reference the external data */ if ((flags & BHND_NVRAM_VAL_BORROW_DATA) || (flags & BHND_NVRAM_VAL_STATIC_DATA)) { if (flags & BHND_NVRAM_VAL_STATIC_DATA) value->data_storage = BHND_NVRAM_VAL_DATA_EXT_STATIC; else value->data_storage = BHND_NVRAM_VAL_DATA_EXT_WEAK; value->data.ptr = inp; value->data_type = itype; value->data_len = ilen; return (0); } /* Fetch reference to (or allocate) an appropriately sized buffer */ bytes = bhnd_nvram_val_alloc_bytes(value, ilen, itype, flags); if (bytes == NULL) return (ENOMEM); /* Copy data */ memcpy(bytes, inp, ilen); return (0); } /** * Initialize the internal inline representation of @p value with a copy of * the data referenced by @p inp of @p itype. * * If @p inp is NULL, @p itype and @p ilen will be validated, but no data will * be copied. * * @param value The value to be initialized. * @param inp The input data to be copied, or NULL to verify * that data of @p ilen and @p itype can be represented * inline. * @param ilen The size of the external buffer to be allocated. * @param itype The type of the external buffer to be allocated. * * @retval 0 success * @retval ENOMEM if @p ilen is too large to be represented inline. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. */ static int bhnd_nvram_val_set_inline(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype) { BHND_NVRAM_VAL_ASSERT_EMPTY(value); #define NV_STORE_INIT_INLINE() do { \ value->data_len = ilen; \ value->data_type = itype; \ } while(0) #define NV_STORE_INLINE(_type, _dest) do { \ if (ilen != sizeof(_type)) \ return (EFAULT); \ \ if (inp != NULL) { \ value->data._dest[0] = *(const _type *)inp; \ NV_STORE_INIT_INLINE(); \ } \ } while (0) #define NV_COPY_ARRRAY_INLINE(_type, _dest) do { \ if (ilen % sizeof(_type) != 0) \ return (EFAULT); \ \ if (ilen > nitems(value->data. _dest)) \ return (ENOMEM); \ \ if (inp == NULL) \ return (0); \ \ memcpy(&value->data._dest, inp, ilen); \ if (inp != NULL) { \ memcpy(&value->data._dest, inp, ilen); \ NV_STORE_INIT_INLINE(); \ } \ } while (0) /* Attempt to copy to inline storage */ switch (itype) { case BHND_NVRAM_TYPE_NULL: if (ilen != 0) return (EFAULT); /* Nothing to copy */ NV_STORE_INIT_INLINE(); return (0); case BHND_NVRAM_TYPE_CHAR: NV_STORE_INLINE(uint8_t, ch); return (0); case BHND_NVRAM_TYPE_BOOL: NV_STORE_INLINE(bhnd_nvram_bool_t, b); return(0); case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_INT8: NV_STORE_INLINE(uint8_t, u8); return (0); case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_INT16: NV_STORE_INLINE(uint16_t, u16); return (0); case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_INT32: NV_STORE_INLINE(uint32_t, u32); return (0); case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_INT64: NV_STORE_INLINE(uint32_t, u32); return (0); case BHND_NVRAM_TYPE_CHAR_ARRAY: NV_COPY_ARRRAY_INLINE(uint8_t, ch); return (0); case BHND_NVRAM_TYPE_DATA: case BHND_NVRAM_TYPE_UINT8_ARRAY: case BHND_NVRAM_TYPE_INT8_ARRAY: NV_COPY_ARRRAY_INLINE(uint8_t, u8); return (0); case BHND_NVRAM_TYPE_UINT16_ARRAY: case BHND_NVRAM_TYPE_INT16_ARRAY: NV_COPY_ARRRAY_INLINE(uint16_t, u16); return (0); case BHND_NVRAM_TYPE_UINT32_ARRAY: case BHND_NVRAM_TYPE_INT32_ARRAY: NV_COPY_ARRRAY_INLINE(uint32_t, u32); return (0); case BHND_NVRAM_TYPE_UINT64_ARRAY: case BHND_NVRAM_TYPE_INT64_ARRAY: NV_COPY_ARRRAY_INLINE(uint64_t, u64); return (0); case BHND_NVRAM_TYPE_BOOL_ARRAY: NV_COPY_ARRRAY_INLINE(bhnd_nvram_bool_t, b); return(0); case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: if (ilen > sizeof(value->data.ch)) return (ENOMEM); if (inp != NULL) { memcpy(&value->data.ch, inp, ilen); NV_STORE_INIT_INLINE(); } return (0); } #undef NV_STORE_INIT_INLINE #undef NV_STORE_INLINE #undef NV_COPY_ARRRAY_INLINE BHND_NV_PANIC("unknown data type %d", itype); } /** * Initialize the internal representation of @p value with a buffer allocation * of @p len and @p itype, returning a pointer to the allocated buffer. * * If a buffer of @p len and @p itype can be represented inline, no * external buffer will be allocated, and instead a pointer to the inline * data representation will be returned. * * @param value The value to be initialized. * @param ilen The size of the external buffer to be allocated. * @param itype The type of the external buffer to be allocated. * @param flags Value flags. * * @retval non-null The newly allocated buffer. * @retval NULL If allocation failed. * @retval NULL If @p value is an externally allocated instance. */ static void * bhnd_nvram_val_alloc_bytes(bhnd_nvram_val *value, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { void *ptr; BHND_NVRAM_VAL_ASSERT_EMPTY(value); /* Can we use inline storage? */ if (bhnd_nvram_val_set_inline(value, NULL, ilen, itype) == 0) { BHND_NV_ASSERT(sizeof(value->data) >= ilen, ("ilen exceeds inline storage")); value->data_type = itype; value->data_len = ilen; value->data_storage = BHND_NVRAM_VAL_DATA_INLINE; return (&value->data); } /* Is allocation permitted? */ if (!(flags & BHND_NVRAM_VAL_DYNAMIC)) return (NULL); /* Allocate external storage */ if ((ptr = bhnd_nv_malloc(ilen)) == NULL) return (NULL); value->data.ptr = ptr; value->data_len = ilen; value->data_type = itype; value->data_storage = BHND_NVRAM_VAL_DATA_EXT_ALLOC; return (ptr); } Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c =================================================================== --- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c (revision 350434) +++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c (revision 350435) @@ -1,882 +1,883 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #ifdef _KERNEL #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_valuevar.h" #ifdef _KERNEL #define bhnd_nv_hex2ascii(hex) hex2ascii(hex) #else /* !_KERNEL */ static char const bhnd_nv_hex2ascii[] = "0123456789abcdefghijklmnopqrstuvwxyz"; #define bhnd_nv_hex2ascii(hex) (bhnd_nv_hex2ascii[hex]) #endif /* _KERNEL */ /** * Maximum size, in bytes, of a string-encoded NVRAM integer value, not * including any prefix (0x, 0, etc). * * We assume the largest possible encoding is the base-2 representation * of a 64-bit integer. */ #define NV_NUMSTR_MAX ((sizeof(uint64_t) * CHAR_BIT) + 1) /** * Format a string representation of @p value using @p fmt, with, writing the * result to @p outp. * * @param value The value to be formatted. * @param fmt The format string. * @param[out] outp On success, the string will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual number of bytes required for the * requested string encoding (including a trailing * NUL). * * Refer to bhnd_nvram_val_vprintf() for full format string documentation. * * @retval 0 success * @retval EINVAL If @p fmt contains unrecognized format string * specifiers. * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to a single string * value via @p fmt is unsupported. * @retval ERANGE If value coercion of @p value would overflow (or * underflow) the representation defined by @p fmt. */ int bhnd_nvram_val_printf(bhnd_nvram_val *value, const char *fmt, char *outp, size_t *olen, ...) { va_list ap; int error; va_start(ap, olen); error = bhnd_nvram_val_vprintf(value, fmt, outp, olen, ap); va_end(ap); return (error); } /** * Format a string representation of the elements of @p value using @p fmt, * writing the result to @p outp. * * @param value The value to be formatted. * @param fmt The format string. * @param[out] outp On success, the string will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual number of bytes required for the * requested string encoding (including a trailing * NUL). * @param ap Argument list. * * @par Format Strings * * Value format strings are similar, but not identical to, those used * by printf(3). * * Format specifier format: * %[repeat][flags][width][.precision][length modifier][specifier] * * The format specifier is interpreted as an encoding directive for an * individual value element; each format specifier will fetch the next element * from the value, encode the element as the appropriate type based on the * length modifiers and specifier, and then format the result as a string. * * For example, given a string value of '0x000F', and a format specifier of * '%#hhx', the value will be asked to encode its first element as * BHND_NVRAM_TYPE_UINT8. String formatting will then be applied to the 8-bit * unsigned integer representation, producing a string value of "0xF". * * Repeat: * - [digits] Repeatedly apply the format specifier to the input * value's elements up to `digits` times. The delimiter * must be passed as a string in the next variadic * argument. * - [] Repeatedly apply the format specifier to the input * value's elements until all elements have been. The * processed. The delimiter must be passed as a string in * the next variadic argument. * - [*] Repeatedly apply the format specifier to the input * value's elements. The repeat count is read from the * next variadic argument as a size_t value * * Flags: * - '#' use alternative form (e.g. 0x/0X prefixing of hex * strings). * - '0' zero padding * - '-' left adjust padding * - '+' include a sign character * - ' ' include a space in place of a sign character for * positive numbers. * * Width/Precision: * - digits minimum field width. * - * read the minimum field width from the next variadic * argument as a ssize_t value. A negative value enables * left adjustment. * - .digits field precision. * - .* read the field precision from the next variadic argument * as a ssize_t value. A negative value enables left * adjustment. * * Length Modifiers: * - 'hh', 'I8' Convert the value to an 8-bit signed or unsigned * integer. * - 'h', 'I16' Convert the value to an 16-bit signed or unsigned * integer. * - 'l', 'I32' Convert the value to an 32-bit signed or unsigned * integer. * - 'll', 'j', 'I64' Convert the value to an 64-bit signed or unsigned * integer. * * Data Specifiers: * - 'd', 'i' Convert and format as a signed decimal integer. * - 'u' Convert and format as an unsigned decimal integer. * - 'o' Convert and format as an unsigned octal integer. * - 'x' Convert and format as an unsigned hexadecimal integer, * using lowercase hex digits. * - 'X' Convert and format as an unsigned hexadecimal integer, * using uppercase hex digits. * - 's' Convert and format as a string. * - '%' Print a literal '%' character. * * @retval 0 success * @retval EINVAL If @p fmt contains unrecognized format string * specifiers. * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to a single string * value via @p fmt is unsupported. * @retval ERANGE If value coercion of @p value would overflow (or * underflow) the representation defined by @p fmt. */ int bhnd_nvram_val_vprintf(bhnd_nvram_val *value, const char *fmt, char *outp, size_t *olen, va_list ap) { const void *elem; size_t elen; size_t limit, nbytes; int error; elem = NULL; /* Determine output byte limit */ nbytes = 0; if (outp != NULL) limit = *olen; else limit = 0; #define WRITE_CHAR(_c) do { \ if (limit > nbytes) \ *(outp + nbytes) = _c; \ \ if (nbytes == SIZE_MAX) \ return (EFTYPE); \ nbytes++; \ } while (0) /* Encode string value as per the format string */ for (const char *p = fmt; *p != '\0'; p++) { const char *delim; size_t precision, width, delim_len; u_long repeat, bits; bool alt_form, ladjust, have_precision; char padc, signc, lenc; padc = ' '; signc = '\0'; lenc = '\0'; delim = ""; delim_len = 0; ladjust = false; alt_form = false; have_precision = false; precision = 1; bits = 32; width = 0; repeat = 1; /* Copy all input to output until we hit a format specifier */ if (*p != '%') { WRITE_CHAR(*p); continue; } /* Hit '%' -- is this followed by an escaped '%' literal? */ p++; if (*p == '%') { WRITE_CHAR('%'); p++; continue; } /* Parse repeat specifier */ if (*p == '[') { p++; /* Determine repeat count */ if (*p == ']') { /* Repeat consumes all input */ repeat = bhnd_nvram_val_nelem(value); } else if (*p == '*') { /* Repeat is supplied as an argument */ repeat = va_arg(ap, size_t); p++; } else { char *endp; /* Repeat specified as argument */ repeat = strtoul(p, &endp, 10); if (p == endp) { BHND_NV_LOG("error parsing repeat " "count at '%s'", p); return (EINVAL); } /* Advance past repeat count */ p = endp; } /* Advance past terminating ']' */ if (*p != ']') { BHND_NV_LOG("error parsing repeat count at " "'%s'", p); return (EINVAL); } p++; delim = va_arg(ap, const char *); delim_len = strlen(delim); } /* Parse flags */ while (*p != '\0') { const char *np; bool stop; stop = false; np = p+1; switch (*p) { case '#': alt_form = true; break; case '0': padc = '0'; break; case '-': ladjust = true; break; case ' ': /* Must not override '+' */ if (signc != '+') signc = ' '; break; case '+': signc = '+'; break; default: /* Non-flag character */ stop = true; break; } if (stop) break; else p = np; } /* Parse minimum width */ if (*p == '*') { ssize_t arg; /* Width is supplied as an argument */ arg = va_arg(ap, int); /* Negative width argument is interpreted as * '-' flag followed by positive width */ if (arg < 0) { ladjust = true; arg = -arg; } width = arg; p++; } else if (bhnd_nv_isdigit(*p)) { uint32_t v; size_t len, parsed; /* Parse width value */ len = sizeof(v); error = bhnd_nvram_parse_int(p, strlen(p), 10, &parsed, &v, &len, BHND_NVRAM_TYPE_UINT32); if (error) { BHND_NV_LOG("error parsing width %s: %d\n", p, error); return (EINVAL); } /* Save width and advance input */ width = v; p += parsed; } /* Parse precision */ if (*p == '.') { uint32_t v; size_t len, parsed; p++; have_precision = true; if (*p == '*') { ssize_t arg; /* Precision is specified as an argument */ arg = va_arg(ap, int); /* Negative precision argument is interpreted * as '-' flag followed by positive * precision */ if (arg < 0) { ladjust = true; arg = -arg; } precision = arg; } else if (!bhnd_nv_isdigit(*p)) { /* Implicit precision of 0 */ precision = 0; } else { /* Parse precision value */ len = sizeof(v); error = bhnd_nvram_parse_int(p, strlen(p), 10, &parsed, &v, &len, BHND_NVRAM_TYPE_UINT32); if (error) { BHND_NV_LOG("error parsing width %s: " "%d\n", p, error); return (EINVAL); } /* Save precision and advance input */ precision = v; p += parsed; } } /* Parse length modifiers */ while (*p != '\0') { const char *np; bool stop; stop = false; np = p+1; switch (*p) { case 'h': if (lenc == '\0') { /* Set initial length value */ lenc = *p; bits = 16; } else if (lenc == *p && bits == 16) { /* Modify previous length value */ bits = 8; } else { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } break; case 'l': if (lenc == '\0') { /* Set initial length value */ lenc = *p; bits = 32; } else if (lenc == *p && bits == 32) { /* Modify previous length value */ bits = 64; } else { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } break; case 'j': /* Conflicts with all other length * specifications, and may only occur once */ if (lenc != '\0') { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } lenc = *p; bits = 64; break; case 'I': { char *endp; /* Conflicts with all other length * specifications, and may only occur once */ if (lenc != '\0') { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } lenc = *p; /* Parse the length specifier value */ p++; bits = strtoul(p, &endp, 10); if (p == endp) { BHND_NV_LOG("invalid size specifier: " "%s\n", p); return (EINVAL); } /* Advance input past the parsed integer */ np = endp; break; } default: /* Non-length modifier character */ stop = true; break; } if (stop) break; else p = np; } /* Parse conversion specifier and format the value(s) */ for (u_long n = 0; n < repeat; n++) { bhnd_nvram_type arg_type; size_t arg_size; size_t i; u_long base; bool is_signed, is_upper; is_signed = false; is_upper = false; base = 0; /* Fetch next element */ elem = bhnd_nvram_val_next(value, elem, &elen); if (elem == NULL) { BHND_NV_LOG("format string references more " "than %zu available value elements\n", bhnd_nvram_val_nelem(value)); return (EINVAL); } /* * If this is not the first value, append the delimiter. */ if (n > 0) { size_t nremain = 0; if (limit > nbytes) nremain = limit - nbytes; if (nremain >= delim_len) memcpy(outp + nbytes, delim, delim_len); /* Add delimiter length to the total byte count */ if (SIZE_MAX - nbytes < delim_len) return (EFTYPE); /* overflows size_t */ nbytes += delim_len; } /* Parse integer conversion specifiers */ switch (*p) { case 'd': case 'i': base = 10; is_signed = true; break; case 'u': base = 10; break; case 'o': base = 8; break; case 'x': base = 16; break; case 'X': base = 16; is_upper = true; break; } /* Format argument */ switch (*p) { #define NV_ENCODE_INT(_width) do { \ arg_type = (is_signed) ? BHND_NVRAM_TYPE_INT ## _width : \ BHND_NVRAM_TYPE_UINT ## _width; \ arg_size = sizeof(v.u ## _width); \ error = bhnd_nvram_val_encode_elem(value, elem, elen, \ &v.u ## _width, &arg_size, arg_type); \ if (error) { \ BHND_NV_LOG("error encoding argument as %s: %d\n", \ bhnd_nvram_type_name(arg_type), error); \ return (error); \ } \ \ if (is_signed) { \ if (v.i ## _width < 0) { \ add_neg = true; \ numval = (int64_t)-(v.i ## _width); \ } else { \ numval = (int64_t) (v.i ## _width); \ } \ } else { \ numval = v.u ## _width; \ } \ } while(0) case 'd': case 'i': case 'u': case 'o': case 'x': case 'X': { char numbuf[NV_NUMSTR_MAX]; char *sptr; uint64_t numval; size_t slen; bool add_neg; union { uint8_t u8; uint16_t u16; uint32_t u32; uint64_t u64; int8_t i8; int16_t i16; int32_t i32; int64_t i64; } v; add_neg = false; /* If precision is specified, it overrides * (and behaves identically) to a zero-prefixed * minimum width */ if (have_precision) { padc = '0'; width = precision; ladjust = false; } /* If zero-padding is used, value must be right * adjusted */ if (padc == '0') ladjust = false; /* Request encode to the appropriate integer * type, and then promote to common 64-bit * representation */ switch (bits) { case 8: NV_ENCODE_INT(8); break; case 16: NV_ENCODE_INT(16); break; case 32: NV_ENCODE_INT(32); break; case 64: NV_ENCODE_INT(64); break; default: BHND_NV_LOG("invalid length specifier: " "%lu\n", bits); return (EINVAL); } #undef NV_ENCODE_INT /* If a precision of 0 is specified and the * value is also zero, no characters should * be produced */ if (have_precision && precision == 0 && numval == 0) { break; } /* Emit string representation to local buffer */ BHND_NV_ASSERT(base <= 16, ("invalid base")); sptr = numbuf + nitems(numbuf) - 1; for (slen = 0; slen < sizeof(numbuf); slen++) { char c; uint64_t n; n = numval % base; c = bhnd_nv_hex2ascii(n); if (is_upper) c = bhnd_nv_toupper(c); sptr--; *sptr = c; numval /= (uint64_t)base; if (numval == 0) { slen++; break; } } arg_size = slen; /* Reserve space for 0/0x prefix? */ if (alt_form) { if (numval == 0) { /* If 0, no prefix */ alt_form = false; } else if (base == 8) { arg_size += 1; /* 0 */ } else if (base == 16) { arg_size += 2; /* 0x/0X */ } } /* Reserve space for ' ', '+', or '-' prefix? */ if (add_neg || signc != '\0') { if (add_neg) signc = '-'; arg_size++; } /* Right adjust (if using spaces) */ if (!ladjust && padc != '0') { for (i = arg_size; i < width; i++) WRITE_CHAR(padc); } if (signc != '\0') WRITE_CHAR(signc); if (alt_form) { if (base == 8) { WRITE_CHAR('0'); } else if (base == 16) { WRITE_CHAR('0'); if (is_upper) WRITE_CHAR('X'); else WRITE_CHAR('x'); } } /* Right adjust (if using zeros) */ if (!ladjust && padc == '0') { for (i = slen; i < width; i++) WRITE_CHAR(padc); } /* Write the string to our output buffer */ if (limit > nbytes && limit - nbytes >= slen) memcpy(outp + nbytes, sptr, slen); /* Update the total byte count */ if (SIZE_MAX - nbytes < arg_size) return (EFTYPE); /* overflows size_t */ nbytes += arg_size; /* Left adjust */ for (i = arg_size; ladjust && i < width; i++) WRITE_CHAR(padc); break; } case 's': { char *s; size_t slen; /* Query the total length of the element when * converted to a string */ arg_type = BHND_NVRAM_TYPE_STRING; error = bhnd_nvram_val_encode_elem(value, elem, elen, NULL, &arg_size, arg_type); if (error) { BHND_NV_LOG("error encoding argument " "as %s: %d\n", bhnd_nvram_type_name(arg_type), error); return (error); } /* Do not include trailing NUL in the string * length */ if (arg_size > 0) arg_size--; /* Right adjust */ for (i = arg_size; !ladjust && i < width; i++) WRITE_CHAR(padc); /* Determine output positition and remaining * buffer space */ if (limit > nbytes) { s = outp + nbytes; slen = limit - nbytes; } else { s = NULL; slen = 0; } /* Encode the string to our output buffer */ error = bhnd_nvram_val_encode_elem(value, elem, elen, s, &slen, arg_type); if (error && error != ENOMEM) { BHND_NV_LOG("error encoding argument " "as %s: %d\n", bhnd_nvram_type_name(arg_type), error); return (error); } /* Update the total byte count */ if (SIZE_MAX - nbytes < arg_size) return (EFTYPE); /* overflows size_t */ nbytes += arg_size; /* Left adjust */ for (i = arg_size; ladjust && i < width; i++) WRITE_CHAR(padc); break; } case 'c': { char c; arg_type = BHND_NVRAM_TYPE_CHAR; arg_size = bhnd_nvram_type_width(arg_type); /* Encode as single character */ error = bhnd_nvram_val_encode_elem(value, elem, elen, &c, &arg_size, arg_type); if (error) { BHND_NV_LOG("error encoding argument " "as %s: %d\n", bhnd_nvram_type_name(arg_type), error); return (error); } BHND_NV_ASSERT(arg_size == sizeof(c), ("invalid encoded size")); /* Right adjust */ for (i = arg_size; !ladjust && i < width; i++) WRITE_CHAR(padc); WRITE_CHAR(padc); /* Left adjust */ for (i = arg_size; ladjust && i < width; i++) WRITE_CHAR(padc); break; } } } } /* Append terminating NUL */ if (limit > nbytes) *(outp + nbytes) = '\0'; if (nbytes < SIZE_MAX) nbytes++; else return (EFTYPE); /* Report required space */ *olen = nbytes; if (limit < nbytes) { if (outp != NULL) return (ENOMEM); } return (0); } Index: projects/fuse2/sys/dev/drm2/drmP.h =================================================================== --- projects/fuse2/sys/dev/drm2/drmP.h (revision 350434) +++ projects/fuse2/sys/dev/drm2/drmP.h (revision 350435) @@ -1,1955 +1,1956 @@ /** * \file drmP.h * Private header for Direct Rendering Manager * * \author Rickard E. (Rik) Faith * \author Gareth Hughes */ /* * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #ifndef _DRM_P_H_ #define _DRM_P_H_ #if defined(_KERNEL) || defined(__KERNEL__) #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE)) #define __OS_HAS_AGP 1 #else #define __OS_HAS_AGP 0 #endif #if defined(CONFIG_MTRR) #define __OS_HAS_MTRR 1 #else #define __OS_HAS_MTRR 0 #endif struct drm_file; struct drm_device; #include #include #include "opt_drm.h" #include "opt_syscons.h" #ifdef DRM_DEBUG #undef DRM_DEBUG #define DRM_DEBUG_DEFAULT_ON 1 #endif /* DRM_DEBUG */ #define DRM_DEBUGBITS_DEBUG 0x1 #define DRM_DEBUGBITS_KMS 0x2 #define DRM_DEBUGBITS_FAILED_IOCTL 0x4 #undef DRM_LINUX #define DRM_LINUX 0 /***********************************************************************/ /** \name DRM template customization defaults */ /*@{*/ /* driver capabilities and requirements mask */ #define DRIVER_USE_AGP 0x1 #define DRIVER_REQUIRE_AGP 0x2 #define DRIVER_USE_MTRR 0x4 #define DRIVER_PCI_DMA 0x8 #define DRIVER_SG 0x10 #define DRIVER_HAVE_DMA 0x20 #define DRIVER_HAVE_IRQ 0x40 #define DRIVER_IRQ_SHARED 0x80 #define DRIVER_IRQ_VBL 0x100 #define DRIVER_DMA_QUEUE 0x200 #define DRIVER_FB_DMA 0x400 #define DRIVER_IRQ_VBL2 0x800 #define DRIVER_GEM 0x1000 #define DRIVER_MODESET 0x2000 #define DRIVER_PRIME 0x4000 #define DRIVER_BUS_PCI 0x1 #define DRIVER_BUS_PLATFORM 0x2 #define DRIVER_BUS_USB 0x3 /***********************************************************************/ /** \name Begin the DRM... */ /*@{*/ #define DRM_DEBUG_CODE 2 /**< Include debugging code if > 1, then also include looping detection. */ #define DRM_MAGIC_HASH_ORDER 4 /**< Size of key hash table. Must be power of 2. */ #define DRM_KERNEL_CONTEXT 0 /**< Change drm_resctx if changed */ #define DRM_RESERVED_CONTEXTS 1 /**< Change drm_resctx if changed */ #define DRM_LOOPING_LIMIT 5000000 #define DRM_TIME_SLICE (HZ/20) /**< Time slice for GLXContexts */ #define DRM_LOCK_SLICE 1 /**< Time slice for lock, in jiffies */ #define DRM_FLAG_DEBUG 0x01 #define DRM_MAX_CTXBITMAP (PAGE_SIZE * 8) #define DRM_MAP_HASH_OFFSET 0x10000000 /*@}*/ /***********************************************************************/ /** \name Macros to make printk easier */ /*@{*/ /** * Error output. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_ERROR(fmt, ...) \ printf("error: [" DRM_NAME ":pid%d:%s] *ERROR* " fmt, \ DRM_CURRENTPID, __func__ , ##__VA_ARGS__) #define DRM_WARNING(fmt, ...) printf("warning: [" DRM_NAME "] " fmt , ##__VA_ARGS__) #define DRM_INFO(fmt, ...) printf("info: [" DRM_NAME "] " fmt , ##__VA_ARGS__) /** * Debug output. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_DEBUG(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_DEBUG) != 0) \ printf("[" DRM_NAME ":pid%d:%s] " fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_DEBUG_DRIVER(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_DEBUG_KMS(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_KMS(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_MODE(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_DRIVER(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) /*@}*/ /***********************************************************************/ /** \name Internal types and structures */ /*@{*/ #define DRM_ARRAY_SIZE(x) ARRAY_SIZE(x) #define DRM_LEFTCOUNT(x) (((x)->rp + (x)->count - (x)->wp) % ((x)->count + 1)) #define DRM_BUFCOUNT(x) ((x)->count - DRM_LEFTCOUNT(x)) #define DRM_IF_VERSION(maj, min) (maj << 16 | min) /** * Test that the hardware lock is held by the caller, returning otherwise. * * \param dev DRM device. * \param filp file pointer of the caller. */ #define LOCK_TEST_WITH_RETURN( dev, _file_priv ) \ do { \ if (!_DRM_LOCK_IS_HELD(_file_priv->master->lock.hw_lock->lock) || \ _file_priv->master->lock.file_priv != _file_priv) { \ DRM_ERROR( "%s called without lock held, held %d owner %p %p\n",\ __func__, _DRM_LOCK_IS_HELD(_file_priv->master->lock.hw_lock->lock),\ _file_priv->master->lock.file_priv, _file_priv); \ return -EINVAL; \ } \ } while (0) /** * Ioctl function type. * * \param inode device inode. * \param file_priv DRM file private pointer. * \param cmd command. * \param arg argument. */ typedef int drm_ioctl_t(struct drm_device *dev, void *data, struct drm_file *file_priv); #define DRM_IOCTL_NR(n) ((n) & 0xff) #define DRM_MAJOR 226 #define DRM_AUTH 0x1 #define DRM_MASTER 0x2 #define DRM_ROOT_ONLY 0x4 #define DRM_CONTROL_ALLOW 0x8 #define DRM_UNLOCKED 0x10 struct drm_ioctl_desc { unsigned long cmd; int flags; drm_ioctl_t *func; unsigned int cmd_drv; }; /** * Creates a driver or general drm_ioctl_desc array entry for the given * ioctl, for use by drm_ioctl(). */ #define DRM_IOCTL_DEF(ioctl, _func, _flags) \ [DRM_IOCTL_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0} #define DRM_IOCTL_DEF_DRV(ioctl, _func, _flags) \ [DRM_IOCTL_NR(DRM_##ioctl)] = {.cmd = DRM_##ioctl, .func = _func, .flags = _flags, .cmd_drv = DRM_IOCTL_##ioctl} struct drm_magic_entry { struct list_head head; struct drm_hash_item hash_item; struct drm_file *priv; }; /** * DMA buffer. */ struct drm_buf { int idx; /**< Index into master buflist */ int total; /**< Buffer size */ int order; /**< log-base-2(total) */ int used; /**< Amount of buffer in use (for DMA) */ unsigned long offset; /**< Byte offset (used internally) */ void *address; /**< Address of buffer */ unsigned long bus_address; /**< Bus address of buffer */ struct drm_buf *next; /**< Kernel-only: used for free list */ __volatile__ int waiting; /**< On kernel DMA queue */ __volatile__ int pending; /**< On hardware DMA queue */ struct drm_file *file_priv; /**< Private of holding file descr */ int context; /**< Kernel queue for this buffer */ int while_locked; /**< Dispatch this buffer while locked */ enum { DRM_LIST_NONE = 0, DRM_LIST_FREE = 1, DRM_LIST_WAIT = 2, DRM_LIST_PEND = 3, DRM_LIST_PRIO = 4, DRM_LIST_RECLAIM = 5 } list; /**< Which list we're on */ int dev_priv_size; /**< Size of buffer private storage */ void *dev_private; /**< Per-buffer private storage */ }; struct drm_freelist { int initialized; /**< Freelist in use */ atomic_t count; /**< Number of free buffers */ struct drm_buf *next; /**< End pointer */ #ifdef FREEBSD_NOTYET wait_queue_head_t waiting; /**< Processes waiting on free bufs */ #endif /* defined(FREEBSD_NOTYET) */ int low_mark; /**< Low water mark */ int high_mark; /**< High water mark */ #ifdef FREEBSD_NOTYET atomic_t wfh; /**< If waiting for high mark */ spinlock_t lock; #endif /* defined(FREEBSD_NOTYET) */ }; typedef struct drm_dma_handle { void *vaddr; bus_addr_t busaddr; bus_dma_tag_t tag; bus_dmamap_t map; } drm_dma_handle_t; /** * Buffer entry. There is one of this for each buffer size order. */ struct drm_buf_entry { int buf_size; /**< size */ int buf_count; /**< number of buffers */ struct drm_buf *buflist; /**< buffer list */ int seg_count; int page_order; struct drm_dma_handle **seglist; struct drm_freelist freelist; }; /* Event queued up for userspace to read */ struct drm_pending_event { struct drm_event *event; struct list_head link; struct drm_file *file_priv; pid_t pid; /* pid of requester, no guarantee it's valid by the time we deliver the event, for tracing only */ void (*destroy)(struct drm_pending_event *event); }; /* initial implementaton using a linked list - todo hashtab */ struct drm_prime_file_private { struct list_head head; struct mtx lock; }; struct drm_file { int authenticated; pid_t pid; uid_t uid; drm_magic_t magic; unsigned long ioctl_count; struct list_head lhead; struct drm_minor *minor; unsigned long lock_count; void *driver_priv; struct drm_gem_names object_names; int is_master; /* this file private is a master for a minor */ struct drm_master *master; /* master this node is currently associated with N.B. not always minor->master */ struct list_head fbs; struct selinfo event_poll; struct list_head event_list; int event_space; struct drm_prime_file_private prime; }; /** * Lock data. */ struct drm_lock_data { struct drm_hw_lock *hw_lock; /**< Hardware lock */ /** Private of lock holder's file (NULL=kernel) */ struct drm_file *file_priv; wait_queue_head_t lock_queue; /**< Queue of blocked processes */ unsigned long lock_time; /**< Time of last lock in jiffies */ struct mtx spinlock; uint32_t kernel_waiters; uint32_t user_waiters; int idle_has_lock; }; /** * DMA data. */ struct drm_device_dma { struct drm_buf_entry bufs[DRM_MAX_ORDER + 1]; /**< buffers, grouped by their size order */ int buf_count; /**< total number of buffers */ struct drm_buf **buflist; /**< Vector of pointers into drm_device_dma::bufs */ int seg_count; int page_count; /**< number of pages */ unsigned long *pagelist; /**< page list */ unsigned long byte_count; enum { _DRM_DMA_USE_AGP = 0x01, _DRM_DMA_USE_SG = 0x02, _DRM_DMA_USE_FB = 0x04, _DRM_DMA_USE_PCI_RO = 0x08 } flags; }; /** * AGP memory entry. Stored as a doubly linked list. */ struct drm_agp_mem { unsigned long handle; /**< handle */ DRM_AGP_MEM *memory; unsigned long bound; /**< address */ int pages; struct list_head head; }; /** * AGP data. * * \sa drm_agp_init() and drm_device::agp. */ struct drm_agp_head { DRM_AGP_KERN agp_info; /**< AGP device information */ struct list_head memory; unsigned long mode; /**< AGP mode */ device_t bridge; int enabled; /**< whether the AGP bus as been enabled */ int acquired; /**< whether the AGP device has been acquired */ unsigned long base; int agp_mtrr; int cant_use_aperture; }; /** * Scatter-gather memory. */ struct drm_sg_mem { vm_offset_t vaddr; vm_paddr_t *busaddr; vm_pindex_t pages; }; struct drm_sigdata { int context; struct drm_hw_lock *lock; }; /** * Kernel side of a mapping */ #define DRM_MAP_HANDLE_BITS (sizeof(void *) == 4 ? 4 : 24) #define DRM_MAP_HANDLE_SHIFT (sizeof(void *) * 8 - DRM_MAP_HANDLE_BITS) struct drm_local_map { resource_size_t offset; /**< Requested physical address (0 for SAREA)*/ unsigned long size; /**< Requested physical size (bytes) */ enum drm_map_type type; /**< Type of memory to map */ enum drm_map_flags flags; /**< Flags */ void *handle; /**< User-space: "Handle" to pass to mmap() */ /**< Kernel-space: kernel-virtual address */ int mtrr; /**< MTRR slot used */ /* Private data */ drm_dma_handle_t *dmah; }; typedef struct drm_local_map drm_local_map_t; /** * Mappings list */ struct drm_map_list { struct list_head head; /**< list head */ struct drm_hash_item hash; struct drm_local_map *map; /**< mapping */ uint64_t user_token; struct drm_master *master; struct drm_mm_node *file_offset_node; /**< fake offset */ }; /** * Context handle list */ struct drm_ctx_list { struct list_head head; /**< list head */ drm_context_t handle; /**< context handle */ struct drm_file *tag; /**< associated fd private data */ }; /* location of GART table */ #define DRM_ATI_GART_MAIN 1 #define DRM_ATI_GART_FB 2 #define DRM_ATI_GART_PCI 1 #define DRM_ATI_GART_PCIE 2 #define DRM_ATI_GART_IGP 3 struct drm_ati_pcigart_info { int gart_table_location; int gart_reg_if; void *addr; dma_addr_t bus_addr; dma_addr_t table_mask; struct drm_dma_handle *table_handle; struct drm_local_map mapping; int table_size; struct drm_dma_handle *dmah; /* handle for ATI PCIGART table FIXME */ }; /** * GEM specific mm private for tracking GEM objects */ struct drm_gem_mm { struct unrhdr *idxunr; struct drm_open_hash offset_hash; /**< User token hash table for maps */ }; /** * This structure defines the drm_mm memory object, which will be used by the * DRM for its buffer objects. */ struct drm_gem_object { /** Reference count of this object */ u_int refcount; /** Handle count of this object. Each handle also holds a reference */ atomic_t handle_count; /* number of handles on this object */ /** Related drm device */ struct drm_device *dev; /** File representing the shmem storage: filp in Linux parlance */ vm_object_t vm_obj; /* Mapping info for this object */ bool on_map; struct drm_hash_item map_list; /** * Size of the object, in bytes. Immutable over the object's * lifetime. */ size_t size; /** * Global name for this object, starts at 1. 0 means unnamed. * Access is covered by the object_name_lock in the related drm_device */ int name; /** * Memory domains. These monitor which caches contain read/write data * related to the object. When transitioning from one set of domains * to another, the driver is called to ensure that caches are suitably * flushed and invalidated */ uint32_t read_domains; uint32_t write_domain; /** * While validating an exec operation, the * new read/write domain values are computed here. * They will be transferred to the above values * at the point that any cache flushing occurs */ uint32_t pending_read_domains; uint32_t pending_write_domain; void *driver_private; #ifdef FREEBSD_NOTYET /* dma buf exported from this GEM object */ struct dma_buf *export_dma_buf; /* dma buf attachment backing this object */ struct dma_buf_attachment *import_attach; #endif /* FREEBSD_NOTYET */ }; #include /* per-master structure */ struct drm_master { u_int refcount; /* refcount for this master */ struct list_head head; /**< each minor contains a list of masters */ struct drm_minor *minor; /**< link back to minor we are a master for */ char *unique; /**< Unique identifier: e.g., busid */ int unique_len; /**< Length of unique field */ int unique_size; /**< amount allocated */ int blocked; /**< Blocked due to VC switch? */ /** \name Authentication */ /*@{ */ struct drm_open_hash magiclist; struct list_head magicfree; /*@} */ struct drm_lock_data lock; /**< Information on hardware lock */ void *driver_priv; /**< Private structure for driver to use */ }; /* Size of ringbuffer for vblank timestamps. Just double-buffer * in initial implementation. */ #define DRM_VBLANKTIME_RBSIZE 2 /* Flags and return codes for get_vblank_timestamp() driver function. */ #define DRM_CALLED_FROM_VBLIRQ 1 #define DRM_VBLANKTIME_SCANOUTPOS_METHOD (1 << 0) #define DRM_VBLANKTIME_INVBL (1 << 1) /* get_scanout_position() return flags */ #define DRM_SCANOUTPOS_VALID (1 << 0) #define DRM_SCANOUTPOS_INVBL (1 << 1) #define DRM_SCANOUTPOS_ACCURATE (1 << 2) struct drm_bus { int bus_type; int (*get_irq)(struct drm_device *dev); void (*free_irq)(struct drm_device *dev); const char *(*get_name)(struct drm_device *dev); int (*set_busid)(struct drm_device *dev, struct drm_master *master); int (*set_unique)(struct drm_device *dev, struct drm_master *master, struct drm_unique *unique); int (*irq_by_busid)(struct drm_device *dev, struct drm_irq_busid *p); /* hooks that are for PCI */ int (*agp_init)(struct drm_device *dev); }; /** * DRM driver structure. This structure represent the common code for * a family of cards. There will one drm_device for each card present * in this family */ struct drm_driver { int (*load) (struct drm_device *, unsigned long flags); int (*firstopen) (struct drm_device *); int (*open) (struct drm_device *, struct drm_file *); void (*preclose) (struct drm_device *, struct drm_file *file_priv); void (*postclose) (struct drm_device *, struct drm_file *); void (*lastclose) (struct drm_device *); int (*unload) (struct drm_device *); int (*suspend) (struct drm_device *, pm_message_t state); int (*resume) (struct drm_device *); int (*dma_ioctl) (struct drm_device *dev, void *data, struct drm_file *file_priv); int (*dma_quiescent) (struct drm_device *); int (*context_dtor) (struct drm_device *dev, int context); /** * get_vblank_counter - get raw hardware vblank counter * @dev: DRM device * @crtc: counter to fetch * * Driver callback for fetching a raw hardware vblank counter for @crtc. * If a device doesn't have a hardware counter, the driver can simply * return the value of drm_vblank_count. The DRM core will account for * missed vblank events while interrupts where disabled based on system * timestamps. * * Wraparound handling and loss of events due to modesetting is dealt * with in the DRM core code. * * RETURNS * Raw vblank counter value. */ u32 (*get_vblank_counter) (struct drm_device *dev, int crtc); /** * enable_vblank - enable vblank interrupt events * @dev: DRM device * @crtc: which irq to enable * * Enable vblank interrupts for @crtc. If the device doesn't have * a hardware vblank counter, this routine should be a no-op, since * interrupts will have to stay on to keep the count accurate. * * RETURNS * Zero on success, appropriate errno if the given @crtc's vblank * interrupt cannot be enabled. */ int (*enable_vblank) (struct drm_device *dev, int crtc); /** * disable_vblank - disable vblank interrupt events * @dev: DRM device * @crtc: which irq to enable * * Disable vblank interrupts for @crtc. If the device doesn't have * a hardware vblank counter, this routine should be a no-op, since * interrupts will have to stay on to keep the count accurate. */ void (*disable_vblank) (struct drm_device *dev, int crtc); /** * Called by \c drm_device_is_agp. Typically used to determine if a * card is really attached to AGP or not. * * \param dev DRM device handle * * \returns * One of three values is returned depending on whether or not the * card is absolutely \b not AGP (return of 0), absolutely \b is AGP * (return of 1), or may or may not be AGP (return of 2). */ int (*device_is_agp) (struct drm_device *dev); /** * Called by vblank timestamping code. * * Return the current display scanout position from a crtc. * * \param dev DRM device. * \param crtc Id of the crtc to query. * \param *vpos Target location for current vertical scanout position. * \param *hpos Target location for current horizontal scanout position. * * Returns vpos as a positive number while in active scanout area. * Returns vpos as a negative number inside vblank, counting the number * of scanlines to go until end of vblank, e.g., -1 means "one scanline * until start of active scanout / end of vblank." * * \return Flags, or'ed together as follows: * * DRM_SCANOUTPOS_VALID = Query successful. * DRM_SCANOUTPOS_INVBL = Inside vblank. * DRM_SCANOUTPOS_ACCURATE = Returned position is accurate. A lack of * this flag means that returned position may be offset by a constant * but unknown small number of scanlines wrt. real scanout position. * */ int (*get_scanout_position) (struct drm_device *dev, int crtc, int *vpos, int *hpos); /** * Called by \c drm_get_last_vbltimestamp. Should return a precise * timestamp when the most recent VBLANK interval ended or will end. * * Specifically, the timestamp in @vblank_time should correspond as * closely as possible to the time when the first video scanline of * the video frame after the end of VBLANK will start scanning out, * the time immediately after end of the VBLANK interval. If the * @crtc is currently inside VBLANK, this will be a time in the future. * If the @crtc is currently scanning out a frame, this will be the * past start time of the current scanout. This is meant to adhere * to the OpenML OML_sync_control extension specification. * * \param dev dev DRM device handle. * \param crtc crtc for which timestamp should be returned. * \param *max_error Maximum allowable timestamp error in nanoseconds. * Implementation should strive to provide timestamp * with an error of at most *max_error nanoseconds. * Returns true upper bound on error for timestamp. * \param *vblank_time Target location for returned vblank timestamp. * \param flags 0 = Defaults, no special treatment needed. * \param DRM_CALLED_FROM_VBLIRQ = Function is called from vblank * irq handler. Some drivers need to apply some workarounds * for gpu-specific vblank irq quirks if flag is set. * * \returns * Zero if timestamping isn't supported in current display mode or a * negative number on failure. A positive status code on success, * which describes how the vblank_time timestamp was computed. */ int (*get_vblank_timestamp) (struct drm_device *dev, int crtc, int *max_error, struct timeval *vblank_time, unsigned flags); /* these have to be filled in */ irqreturn_t(*irq_handler) (DRM_IRQ_ARGS); void (*irq_preinstall) (struct drm_device *dev); int (*irq_postinstall) (struct drm_device *dev); void (*irq_uninstall) (struct drm_device *dev); void (*set_version) (struct drm_device *dev, struct drm_set_version *sv); /* Master routines */ int (*master_create)(struct drm_device *dev, struct drm_master *master); void (*master_destroy)(struct drm_device *dev, struct drm_master *master); /** * master_set is called whenever the minor master is set. * master_drop is called whenever the minor master is dropped. */ int (*master_set)(struct drm_device *dev, struct drm_file *file_priv, bool from_open); void (*master_drop)(struct drm_device *dev, struct drm_file *file_priv, bool from_release); /** * Driver-specific constructor for drm_gem_objects, to set up * obj->driver_private. * * Returns 0 on success. */ int (*gem_init_object) (struct drm_gem_object *obj); void (*gem_free_object) (struct drm_gem_object *obj); int (*gem_open_object) (struct drm_gem_object *, struct drm_file *); void (*gem_close_object) (struct drm_gem_object *, struct drm_file *); #ifdef FREEBSD_NOTYET /* prime: */ /* export handle -> fd (see drm_gem_prime_handle_to_fd() helper) */ int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); /* import fd -> handle (see drm_gem_prime_fd_to_handle() helper) */ int (*prime_fd_to_handle)(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); /* export GEM -> dmabuf */ struct dma_buf * (*gem_prime_export)(struct drm_device *dev, struct drm_gem_object *obj, int flags); /* import dmabuf -> GEM */ struct drm_gem_object * (*gem_prime_import)(struct drm_device *dev, struct dma_buf *dma_buf); #endif /* defined(FREEBSD_NOTYET) */ /* dumb alloc support */ int (*dumb_create)(struct drm_file *file_priv, struct drm_device *dev, struct drm_mode_create_dumb *args); int (*dumb_map_offset)(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle, uint64_t *offset); int (*dumb_destroy)(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle); /* Driver private ops for this object */ struct cdev_pager_ops *gem_pager_ops; int (*sysctl_init)(struct drm_device *dev, struct sysctl_ctx_list *ctx, struct sysctl_oid *top); void (*sysctl_cleanup)(struct drm_device *dev); int major; int minor; int patchlevel; char *name; char *desc; char *date; u32 driver_features; int dev_priv_size; struct drm_ioctl_desc *ioctls; int num_ioctls; struct drm_bus *bus; #ifdef COMPAT_FREEBSD32 struct drm_ioctl_desc *compat_ioctls; int *num_compat_ioctls; #endif int buf_priv_size; }; #define DRM_MINOR_UNASSIGNED 0 #define DRM_MINOR_LEGACY 1 #define DRM_MINOR_CONTROL 2 #define DRM_MINOR_RENDER 3 /** * DRM minor structure. This structure represents a drm minor number. */ struct drm_minor { int index; /**< Minor device number */ int type; /**< Control or render */ struct cdev *device; /**< Device number for mknod */ device_t kdev; /**< OS device */ struct drm_device *dev; struct drm_master *master; /* currently active master for this node */ struct list_head master_list; struct drm_mode_group mode_group; struct sigio *buf_sigio; /* Processes waiting for SIGIO */ }; /* mode specified on the command line */ struct drm_cmdline_mode { bool specified; bool refresh_specified; bool bpp_specified; int xres, yres; int bpp; int refresh; bool rb; bool interlace; bool cvt; bool margins; enum drm_connector_force force; }; struct drm_pending_vblank_event { struct drm_pending_event base; int pipe; struct drm_event_vblank event; }; /** * DRM device structure. This structure represent a complete card that * may contain multiple heads. */ struct drm_device { int if_version; /**< Highest interface version set */ /** \name Locks */ /*@{ */ struct mtx count_lock; /**< For inuse, drm_device::open_count, drm_device::buf_use */ struct sx dev_struct_lock; /**< For others */ /*@} */ /** \name Usage Counters */ /*@{ */ int open_count; /**< Outstanding files open */ atomic_t ioctl_count; /**< Outstanding IOCTLs pending */ atomic_t vma_count; /**< Outstanding vma areas open */ int buf_use; /**< Buffers in use -- cannot alloc */ atomic_t buf_alloc; /**< Buffer allocation in progress */ /*@} */ /** \name Performance counters */ /*@{ */ unsigned long counters; enum drm_stat_type types[15]; atomic_t counts[15]; /*@} */ struct list_head filelist; /** \name Memory management */ /*@{ */ struct list_head maplist; /**< Linked list of regions */ int map_count; /**< Number of mappable regions */ struct drm_open_hash map_hash; /**< User token hash table for maps */ /** \name Context handle management */ /*@{ */ struct list_head ctxlist; /**< Linked list of context handles */ int ctx_count; /**< Number of context handles */ struct mtx ctxlist_mutex; /**< For ctxlist */ drm_local_map_t **context_sareas; int max_context; unsigned long *ctx_bitmap; /*@} */ /** \name DMA support */ /*@{ */ struct drm_device_dma *dma; /**< Optional pointer for DMA support */ /*@} */ /** \name Context support */ /*@{ */ int irq_enabled; /**< True if irq handler is enabled */ atomic_t context_flag; /**< Context swapping flag */ atomic_t interrupt_flag; /**< Interruption handler flag */ atomic_t dma_flag; /**< DMA dispatch flag */ wait_queue_head_t context_wait; /**< Processes waiting on ctx switch */ int last_checked; /**< Last context checked for DMA */ int last_context; /**< Last current context */ unsigned long last_switch; /**< jiffies at last context switch */ /*@} */ /** \name VBLANK IRQ support */ /*@{ */ /* * At load time, disabling the vblank interrupt won't be allowed since * old clients may not call the modeset ioctl and therefore misbehave. * Once the modeset ioctl *has* been called though, we can safely * disable them when unused. */ int vblank_disable_allowed; atomic_t *_vblank_count; /**< number of VBLANK interrupts (driver must alloc the right number of counters) */ struct timeval *_vblank_time; /**< timestamp of current vblank_count (drivers must alloc right number of fields) */ struct mtx vblank_time_lock; /**< Protects vblank count and time updates during vblank enable/disable */ struct mtx vbl_lock; atomic_t *vblank_refcount; /* number of users of vblank interruptsper crtc */ u32 *last_vblank; /* protected by dev->vbl_lock, used */ /* for wraparound handling */ int *vblank_enabled; /* so we don't call enable more than once per disable */ int *vblank_inmodeset; /* Display driver is setting mode */ u32 *last_vblank_wait; /* Last vblank seqno waited per CRTC */ struct callout vblank_disable_callout; u32 max_vblank_count; /**< size of vblank counter register */ /** * List of events */ struct list_head vblank_event_list; struct mtx event_lock; /*@} */ struct drm_agp_head *agp; /**< AGP data */ device_t dev; /* Device instance from newbus */ uint16_t pci_device; /* PCI device id */ uint16_t pci_vendor; /* PCI vendor id */ uint16_t pci_subdevice; /* PCI subsystem device id */ uint16_t pci_subvendor; /* PCI subsystem vendor id */ struct drm_sg_mem *sg; /**< Scatter gather memory */ unsigned int num_crtcs; /**< Number of CRTCs on this device */ void *dev_private; /**< device private data */ void *mm_private; struct drm_sigdata sigdata; /**< For block_all_signals */ sigset_t sigmask; struct drm_driver *driver; struct drm_local_map *agp_buffer_map; unsigned int agp_buffer_token; struct drm_minor *control; /**< Control node for card */ struct drm_minor *primary; /**< render type primary screen head */ struct drm_mode_config mode_config; /**< Current mode config */ /** \name GEM information */ /*@{ */ struct sx object_name_lock; struct drm_gem_names object_names; /*@} */ int switch_power_state; atomic_t unplugged; /* device has been unplugged or gone away */ /* Locks */ struct mtx dma_lock; /* protects dev->dma */ struct mtx irq_lock; /* protects irq condition checks */ /* Context support */ int irq; /* Interrupt used by board */ int msi_enabled; /* MSI enabled */ int irqrid; /* Interrupt used by board */ struct resource *irqr; /* Resource for interrupt used by board */ void *irqh; /* Handle from bus_setup_intr */ /* Storage of resource pointers for drm_get_resource_* */ #define DRM_MAX_PCI_RESOURCE 6 struct resource *pcir[DRM_MAX_PCI_RESOURCE]; int pcirid[DRM_MAX_PCI_RESOURCE]; struct mtx pcir_lock; int pci_domain; int pci_bus; int pci_slot; int pci_func; /* Sysctl support */ struct drm_sysctl_info *sysctl; int sysctl_node_idx; void *drm_ttm_bdev; void *sysctl_private; char busid_str[128]; int modesetting; const drm_pci_id_list_t *id_entry; /* PCI ID, name, and chipset private */ }; #define DRM_SWITCH_POWER_ON 0 #define DRM_SWITCH_POWER_OFF 1 #define DRM_SWITCH_POWER_CHANGING 2 static __inline__ int drm_core_check_feature(struct drm_device *dev, int feature) { return ((dev->driver->driver_features & feature) ? 1 : 0); } static inline int drm_dev_to_irq(struct drm_device *dev) { return dev->driver->bus->get_irq(dev); } #if __OS_HAS_AGP static inline int drm_core_has_AGP(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_USE_AGP); } #else #define drm_core_has_AGP(dev) (0) #endif #if __OS_HAS_MTRR static inline int drm_core_has_MTRR(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_USE_MTRR); } #define DRM_MTRR_WC MDF_WRITECOMBINE int drm_mtrr_add(unsigned long offset, unsigned long size, unsigned int flags); int drm_mtrr_del(int handle, unsigned long offset, unsigned long size, unsigned int flags); #else #define drm_core_has_MTRR(dev) (0) #define DRM_MTRR_WC 0 static inline int drm_mtrr_add(unsigned long offset, unsigned long size, unsigned int flags) { return 0; } static inline int drm_mtrr_del(int handle, unsigned long offset, unsigned long size, unsigned int flags) { return 0; } #endif /******************************************************************/ /** \name Internal function definitions */ /*@{*/ /* Driver support (drm_drv.h) */ d_ioctl_t drm_ioctl; extern int drm_lastclose(struct drm_device *dev); /* Device support (drm_fops.h) */ extern struct sx drm_global_mutex; d_open_t drm_open; d_read_t drm_read; extern void drm_release(void *data); /* Mapping support (drm_vm.h) */ d_mmap_t drm_mmap; int drm_mmap_single(struct cdev *kdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); d_poll_t drm_poll; /* Misc. IOCTL support (drm_ioctl.h) */ extern int drm_irq_by_busid(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getunique(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_setunique(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getmap(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getclient(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getstats(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_setversion(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_noop(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Context IOCTL support (drm_context.h) */ extern int drm_resctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_addctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_modctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_switchctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_newctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_rmctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_ctxbitmap_init(struct drm_device *dev); extern void drm_ctxbitmap_cleanup(struct drm_device *dev); extern void drm_ctxbitmap_free(struct drm_device *dev, int ctx_handle); extern int drm_setsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Authentication IOCTL support (drm_auth.h) */ extern int drm_getmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_authmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_remove_magic(struct drm_master *master, drm_magic_t magic); /* Cache management (drm_cache.c) */ void drm_clflush_pages(vm_page_t *pages, unsigned long num_pages); void drm_clflush_virt_range(char *addr, unsigned long length); /* Locking IOCTL support (drm_lock.h) */ extern int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_unlock(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_lock_free(struct drm_lock_data *lock_data, unsigned int context); extern void drm_idlelock_take(struct drm_lock_data *lock_data); extern void drm_idlelock_release(struct drm_lock_data *lock_data); /* * These are exported to drivers so that they can implement fencing using * DMA quiscent + idle. DMA quiescent usually requires the hardware lock. */ extern int drm_i_have_hw_lock(struct drm_device *dev, struct drm_file *file_priv); /* Buffer management support (drm_bufs.h) */ extern int drm_addbufs_agp(struct drm_device *dev, struct drm_buf_desc * request); extern int drm_addbufs_pci(struct drm_device *dev, struct drm_buf_desc * request); extern int drm_addmap(struct drm_device *dev, resource_size_t offset, unsigned int size, enum drm_map_type type, enum drm_map_flags flags, struct drm_local_map **map_ptr); extern int drm_addmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_rmmap(struct drm_device *dev, struct drm_local_map *map); extern int drm_rmmap_locked(struct drm_device *dev, struct drm_local_map *map); extern int drm_rmmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_addbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_infobufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_markbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_freebufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_order(unsigned long size); /* DMA support (drm_dma.h) */ extern int drm_dma_setup(struct drm_device *dev); extern void drm_dma_takedown(struct drm_device *dev); extern void drm_free_buffer(struct drm_device *dev, struct drm_buf * buf); extern void drm_core_reclaim_buffers(struct drm_device *dev, struct drm_file *filp); /* IRQ support (drm_irq.h) */ extern int drm_control(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_irq_install(struct drm_device *dev); extern int drm_irq_uninstall(struct drm_device *dev); extern int drm_vblank_init(struct drm_device *dev, int num_crtcs); extern int drm_wait_vblank(struct drm_device *dev, void *data, struct drm_file *filp); extern int drm_vblank_wait(struct drm_device *dev, unsigned int *vbl_seq); extern u32 drm_vblank_count(struct drm_device *dev, int crtc); extern u32 drm_vblank_count_and_time(struct drm_device *dev, int crtc, struct timeval *vblanktime); extern void drm_send_vblank_event(struct drm_device *dev, int crtc, struct drm_pending_vblank_event *e); extern bool drm_handle_vblank(struct drm_device *dev, int crtc); extern int drm_vblank_get(struct drm_device *dev, int crtc); extern void drm_vblank_put(struct drm_device *dev, int crtc); extern void drm_vblank_off(struct drm_device *dev, int crtc); extern void drm_vblank_cleanup(struct drm_device *dev); extern u32 drm_get_last_vbltimestamp(struct drm_device *dev, int crtc, struct timeval *tvblank, unsigned flags); extern int drm_calc_vbltimestamp_from_scanoutpos(struct drm_device *dev, int crtc, int *max_error, struct timeval *vblank_time, unsigned flags, struct drm_crtc *refcrtc); extern void drm_calc_timestamping_constants(struct drm_crtc *crtc); extern bool drm_mode_parse_command_line_for_connector(const char *mode_option, struct drm_connector *connector, struct drm_cmdline_mode *mode); extern struct drm_display_mode * drm_mode_create_from_cmdline_mode(struct drm_device *dev, struct drm_cmdline_mode *cmd); /* Modesetting support */ extern void drm_vblank_pre_modeset(struct drm_device *dev, int crtc); extern void drm_vblank_post_modeset(struct drm_device *dev, int crtc); extern int drm_modeset_ctl(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Stub support (drm_stub.h) */ extern int drm_setmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_dropmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); struct drm_master *drm_master_create(struct drm_minor *minor); extern struct drm_master *drm_master_get(struct drm_master *master); extern void drm_master_put(struct drm_master **master); extern void drm_put_dev(struct drm_device *dev); extern int drm_put_minor(struct drm_minor **minor); extern void drm_unplug_dev(struct drm_device *dev); extern unsigned int drm_debug; extern unsigned int drm_notyet; extern unsigned int drm_vblank_offdelay; extern unsigned int drm_timestamp_precision; extern unsigned int drm_timestamp_monotonic; extern struct drm_local_map *drm_getsarea(struct drm_device *dev); #ifdef FREEBSD_NOTYET extern int drm_gem_prime_handle_to_fd(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); extern int drm_gem_prime_fd_to_handle(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); extern int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_prime_sg_to_page_addr_arrays(struct sg_table *sgt, vm_page_t *pages, dma_addr_t *addrs, int max_pages); extern struct sg_table *drm_prime_pages_to_sg(vm_page_t *pages, int nr_pages); extern void drm_prime_gem_destroy(struct drm_gem_object *obj, struct sg_table *sg); void drm_prime_init_file_private(struct drm_prime_file_private *prime_fpriv); void drm_prime_destroy_file_private(struct drm_prime_file_private *prime_fpriv); int drm_prime_add_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t handle); int drm_prime_lookup_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t *handle); void drm_prime_remove_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf); int drm_prime_add_dma_buf(struct drm_device *dev, struct drm_gem_object *obj); int drm_prime_lookup_obj(struct drm_device *dev, struct dma_buf *buf, struct drm_gem_object **obj); #endif /* FREEBSD_NOTYET */ /* Scatter Gather Support (drm_scatter.h) */ extern void drm_sg_cleanup(struct drm_sg_mem * entry); extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request); extern int drm_sg_free(struct drm_device *dev, void *data, struct drm_file *file_priv); /* ATI PCIGART support (ati_pcigart.h) */ extern int drm_ati_pcigart_init(struct drm_device *dev, struct drm_ati_pcigart_info * gart_info); extern int drm_ati_pcigart_cleanup(struct drm_device *dev, struct drm_ati_pcigart_info * gart_info); extern drm_dma_handle_t *drm_pci_alloc(struct drm_device *dev, size_t size, size_t align, dma_addr_t maxaddr); extern void __drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); extern void drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); /* Graphics Execution Manager library functions (drm_gem.c) */ int drm_gem_init(struct drm_device *dev); void drm_gem_destroy(struct drm_device *dev); void drm_gem_object_release(struct drm_gem_object *obj); void drm_gem_object_free(struct drm_gem_object *obj); struct drm_gem_object *drm_gem_object_alloc(struct drm_device *dev, size_t size); int drm_gem_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); int drm_gem_private_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); void drm_gem_object_handle_free(struct drm_gem_object *obj); int drm_gem_mmap_single(struct drm_device *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); void drm_gem_pager_dtr(void *obj); #include static inline void drm_gem_object_reference(struct drm_gem_object *obj) { KASSERT(obj->refcount > 0, ("Dangling obj %p", obj)); refcount_acquire(&obj->refcount); } static inline void drm_gem_object_unreference(struct drm_gem_object *obj) { if (obj == NULL) return; if (refcount_release(&obj->refcount)) drm_gem_object_free(obj); } static inline void drm_gem_object_unreference_unlocked(struct drm_gem_object *obj) { if (obj != NULL) { struct drm_device *dev = obj->dev; DRM_LOCK(dev); drm_gem_object_unreference(obj); DRM_UNLOCK(dev); } } int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); int drm_gem_handle_delete(struct drm_file *filp, u32 handle); static inline void drm_gem_object_handle_reference(struct drm_gem_object *obj) { drm_gem_object_reference(obj); atomic_inc(&obj->handle_count); } static inline void drm_gem_object_handle_unreference(struct drm_gem_object *obj) { if (obj == NULL) return; if (atomic_read(&obj->handle_count) == 0) return; /* * Must bump handle count first as this may be the last * ref, in which case the object would disappear before we * checked for a name */ if (atomic_dec_and_test(&obj->handle_count)) drm_gem_object_handle_free(obj); drm_gem_object_unreference(obj); } static inline void drm_gem_object_handle_unreference_unlocked(struct drm_gem_object *obj) { if (obj == NULL) return; if (atomic_read(&obj->handle_count) == 0) return; /* * Must bump handle count first as this may be the last * ref, in which case the object would disappear before we * checked for a name */ if (atomic_dec_and_test(&obj->handle_count)) drm_gem_object_handle_free(obj); drm_gem_object_unreference_unlocked(obj); } void drm_gem_free_mmap_offset(struct drm_gem_object *obj); int drm_gem_create_mmap_offset(struct drm_gem_object *obj); struct drm_gem_object *drm_gem_object_lookup(struct drm_device *dev, struct drm_file *filp, u32 handle); int drm_gem_close_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_flink_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_open_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); void drm_gem_open(struct drm_device *dev, struct drm_file *file_private); void drm_gem_release(struct drm_device *dev, struct drm_file *file_private); extern void drm_core_ioremap(struct drm_local_map *map, struct drm_device *dev); extern void drm_core_ioremap_wc(struct drm_local_map *map, struct drm_device *dev); extern void drm_core_ioremapfree(struct drm_local_map *map, struct drm_device *dev); static __inline__ struct drm_local_map *drm_core_findmap(struct drm_device *dev, unsigned int token) { struct drm_map_list *_entry; list_for_each_entry(_entry, &dev->maplist, head) if (_entry->user_token == token) return _entry->map; return NULL; } static __inline__ void drm_core_dropmap(struct drm_local_map *map) { } #include extern int drm_fill_in_dev(struct drm_device *dev, struct drm_driver *driver); extern void drm_cancel_fill_in_dev(struct drm_device *dev); int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type); /*@}*/ /* PCI section */ int drm_pci_device_is_agp(struct drm_device *dev); int drm_pci_device_is_pcie(struct drm_device *dev); extern int drm_get_pci_dev(device_t kdev, struct drm_device *dev, struct drm_driver *driver); #define DRM_PCIE_SPEED_25 1 #define DRM_PCIE_SPEED_50 2 #define DRM_PCIE_SPEED_80 4 extern int drm_pcie_get_speed_cap_mask(struct drm_device *dev, u32 *speed_mask); #define drm_can_sleep() (DRM_HZ & 1) /* Platform section */ int drm_get_platform_dev(device_t kdev, struct drm_device *dev, struct drm_driver *driver); /* FreeBSD specific -- should be moved to drm_os_freebsd.h */ #define DRM_GEM_MAPPING_MASK (3ULL << 62) #define DRM_GEM_MAPPING_KEY (2ULL << 62) /* Non-canonical address form */ #define DRM_GEM_MAX_IDX 0x3fffff #define DRM_GEM_MAPPING_IDX(o) (((o) >> 40) & DRM_GEM_MAX_IDX) #define DRM_GEM_MAPPING_OFF(i) (((uint64_t)(i)) << 40) #define DRM_GEM_MAPPING_MAPOFF(o) \ ((o) & ~(DRM_GEM_MAPPING_OFF(DRM_GEM_MAX_IDX) | DRM_GEM_MAPPING_KEY)) SYSCTL_DECL(_hw_drm); #define DRM_DEV_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define DRM_DEV_UID UID_ROOT #define DRM_DEV_GID GID_VIDEO #define DRM_WAKEUP(w) wakeup((void *)w) #define DRM_WAKEUP_INT(w) wakeup(w) #define DRM_INIT_WAITQUEUE(queue) do {(void)(queue);} while (0) #define DRM_CURPROC curthread #define DRM_STRUCTPROC struct thread #define DRM_SPINTYPE struct mtx #define DRM_SPININIT(l,name) mtx_init(l, name, NULL, MTX_DEF) #define DRM_SPINUNINIT(l) mtx_destroy(l) #define DRM_SPINLOCK(l) mtx_lock(l) #define DRM_SPINUNLOCK(u) mtx_unlock(u) #define DRM_SPINLOCK_IRQSAVE(l, irqflags) do { \ mtx_lock(l); \ (void)irqflags; \ } while (0) #define DRM_SPINUNLOCK_IRQRESTORE(u, irqflags) mtx_unlock(u) #define DRM_SPINLOCK_ASSERT(l) mtx_assert(l, MA_OWNED) #define DRM_LOCK_SLEEP(dev, chan, flags, msg, timeout) \ (sx_sleep((chan), &(dev)->dev_struct_lock, (flags), (msg), (timeout))) #if defined(INVARIANTS) #define DRM_LOCK_ASSERT(dev) sx_assert(&(dev)->dev_struct_lock, SA_XLOCKED) #define DRM_UNLOCK_ASSERT(dev) sx_assert(&(dev)->dev_struct_lock, SA_UNLOCKED) #else #define DRM_LOCK_ASSERT(d) #define DRM_UNLOCK_ASSERT(d) #endif #define DRM_SYSCTL_HANDLER_ARGS (SYSCTL_HANDLER_ARGS) enum { DRM_IS_NOT_AGP, DRM_IS_AGP, DRM_MIGHT_BE_AGP }; #define DRM_VERIFYAREA_READ( uaddr, size ) \ (!useracc(__DECONST(caddr_t, uaddr), size, VM_PROT_READ)) #define DRM_COPY_TO_USER(user, kern, size) \ copyout(kern, user, size) #define DRM_COPY_FROM_USER(kern, user, size) \ copyin(user, kern, size) #define DRM_COPY_FROM_USER_UNCHECKED(arg1, arg2, arg3) \ copyin(arg2, arg1, arg3) #define DRM_COPY_TO_USER_UNCHECKED(arg1, arg2, arg3) \ copyout(arg2, arg1, arg3) #define DRM_GET_USER_UNCHECKED(val, uaddr) \ ((val) = fuword32(uaddr), 0) #define DRM_GET_PRIV_SAREA(_dev, _ctx, _map) do { \ (_map) = (_dev)->context_sareas[_ctx]; \ } while(0) /* Returns -errno to shared code */ #define DRM_WAIT_ON( ret, queue, timeout, condition ) \ for ( ret = 0 ; !ret && !(condition) ; ) { \ DRM_UNLOCK(dev); \ mtx_lock(&dev->irq_lock); \ if (!(condition)) \ ret = -mtx_sleep(&(queue), &dev->irq_lock, \ PCATCH, "drmwtq", (timeout)); \ if (ret == -ERESTART) \ ret = -ERESTARTSYS; \ mtx_unlock(&dev->irq_lock); \ DRM_LOCK(dev); \ } #define dev_err(dev, fmt, ...) \ device_printf((dev), "error: " fmt, ## __VA_ARGS__) #define dev_warn(dev, fmt, ...) \ device_printf((dev), "warning: " fmt, ## __VA_ARGS__) #define dev_info(dev, fmt, ...) \ device_printf((dev), "info: " fmt, ## __VA_ARGS__) #define dev_dbg(dev, fmt, ...) do { \ if ((drm_debug& DRM_DEBUGBITS_KMS) != 0) { \ device_printf((dev), "debug: " fmt, ## __VA_ARGS__); \ } \ } while (0) struct drm_msi_blacklist_entry { int vendor; int device; }; struct drm_vblank_info { wait_queue_head_t queue; /* vblank wait queue */ atomic_t count; /* number of VBLANK interrupts */ /* (driver must alloc the right number of counters) */ atomic_t refcount; /* number of users of vblank interrupts */ u32 last; /* protected by dev->vbl_lock, used */ /* for wraparound handling */ int enabled; /* so we don't call enable more than */ /* once per disable */ int inmodeset; /* Display driver is setting mode */ }; #ifndef DMA_BIT_MASK #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : (1ULL<<(n)) - 1) #endif #define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) enum dmi_field { DMI_NONE, DMI_BIOS_VENDOR, DMI_BIOS_VERSION, DMI_BIOS_DATE, DMI_SYS_VENDOR, DMI_PRODUCT_NAME, DMI_PRODUCT_VERSION, DMI_PRODUCT_SERIAL, DMI_PRODUCT_UUID, DMI_BOARD_VENDOR, DMI_BOARD_NAME, DMI_BOARD_VERSION, DMI_BOARD_SERIAL, DMI_BOARD_ASSET_TAG, DMI_CHASSIS_VENDOR, DMI_CHASSIS_TYPE, DMI_CHASSIS_VERSION, DMI_CHASSIS_SERIAL, DMI_CHASSIS_ASSET_TAG, DMI_STRING_MAX, }; struct dmi_strmatch { unsigned char slot; char substr[79]; }; struct dmi_system_id { int (*callback)(const struct dmi_system_id *); const char *ident; struct dmi_strmatch matches[4]; }; #define DMI_MATCH(a, b) {(a), (b)} bool dmi_check_system(const struct dmi_system_id *); /* Device setup support (drm_drv.c) */ int drm_probe_helper(device_t kdev, const drm_pci_id_list_t *idlist); int drm_attach_helper(device_t kdev, const drm_pci_id_list_t *idlist, struct drm_driver *driver); int drm_generic_suspend(device_t kdev); int drm_generic_resume(device_t kdev); int drm_generic_detach(device_t kdev); void drm_event_wakeup(struct drm_pending_event *e); int drm_add_busid_modesetting(struct drm_device *dev, struct sysctl_ctx_list *ctx, struct sysctl_oid *top); /* Buffer management support (drm_bufs.c) */ unsigned long drm_get_resource_start(struct drm_device *dev, unsigned int resource); unsigned long drm_get_resource_len(struct drm_device *dev, unsigned int resource); /* IRQ support (drm_irq.c) */ irqreturn_t drm_irq_handler(DRM_IRQ_ARGS); void drm_driver_irq_preinstall(struct drm_device *dev); void drm_driver_irq_postinstall(struct drm_device *dev); void drm_driver_irq_uninstall(struct drm_device *dev); /* sysctl support (drm_sysctl.h) */ extern int drm_sysctl_init(struct drm_device *dev); extern int drm_sysctl_cleanup(struct drm_device *dev); int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv); /* consistent PCI memory functions (drm_pci.c) */ int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master); int drm_pci_set_unique(struct drm_device *dev, struct drm_master *master, struct drm_unique *u); int drm_pci_agp_init(struct drm_device *dev); int drm_pci_enable_msi(struct drm_device *dev); void drm_pci_disable_msi(struct drm_device *dev); struct ttm_bo_device; int ttm_bo_mmap_single(struct ttm_bo_device *bdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); struct ttm_buffer_object; void ttm_bo_release_mmap(struct ttm_buffer_object *bo); #if __OS_HAS_AGP /* Memory management support (drm_memory.h) */ extern void drm_free_agp(DRM_AGP_MEM * handle, int pages); extern int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start); #ifdef FREEBSD_NOTYET extern DRM_AGP_MEM *drm_agp_bind_pages(struct drm_device *dev, struct page **pages, unsigned long num_pages, uint32_t gtt_offset, uint32_t type); #endif /* FREEBSD_NOTYET */ extern int drm_unbind_agp(DRM_AGP_MEM * handle); /* AGP/GART support (drm_agpsupport.h) */ extern struct drm_agp_head *drm_agp_init(struct drm_device *dev); extern int drm_agp_acquire(struct drm_device *dev); extern int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_release(struct drm_device *dev); extern int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode); extern int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_info(struct drm_device *dev, struct drm_agp_info *info); extern int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request); extern int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request); extern int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request); extern int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request); extern int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); #else static inline void drm_free_agp(DRM_AGP_MEM * handle, int pages) { } static inline int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start) { return -ENODEV; } static inline int drm_unbind_agp(DRM_AGP_MEM * handle) { return -ENODEV; } #ifdef FREEBSD_NOTYET static inline struct agp_memory *drm_agp_bind_pages(struct drm_device *dev, struct page **pages, unsigned long num_pages, uint32_t gtt_offset, uint32_t type) { return NULL; } #endif static inline struct drm_agp_head *drm_agp_init(struct drm_device *dev) { return NULL; } static inline void drm_agp_clear(struct drm_device *dev) { } static inline int drm_agp_acquire(struct drm_device *dev) { return -ENODEV; } static inline int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_release(struct drm_device *dev) { return -ENODEV; } static inline int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode) { return -ENODEV; } static inline int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_info(struct drm_device *dev, struct drm_agp_info *info) { return -ENODEV; } static inline int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request) { return -ENODEV; } static inline int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request) { return -ENODEV; } static inline int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request) { return -ENODEV; } static inline int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request) { return -ENODEV; } static inline int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } #endif /* __OS_HAS_AGP */ #endif /* __KERNEL__ */ #endif Index: projects/fuse2/sys/dev/mii/micphy.c =================================================================== --- projects/fuse2/sys/dev/mii/micphy.c (revision 350434) +++ projects/fuse2/sys/dev/mii/micphy.c (revision 350435) @@ -1,324 +1,328 @@ /*- - * Copyright (c) 2014 Ruslan Bukin + * Copyright (c) 2014,2019 Ruslan Bukin * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* - * Micrel KSZ9021 Gigabit Ethernet Transceiver + * Micrel KSZ8081/KSZ9021/KSZ9031 Gigabit Ethernet Transceiver */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "miidevs.h" #include "miibus_if.h" #include #include #include #include +#include #define MII_KSZPHY_EXTREG 0x0b #define KSZPHY_EXTREG_WRITE (1 << 15) #define MII_KSZPHY_EXTREG_WRITE 0x0c #define MII_KSZPHY_EXTREG_READ 0x0d #define MII_KSZPHY_CLK_CONTROL_PAD_SKEW 0x104 #define MII_KSZPHY_RX_DATA_PAD_SKEW 0x105 #define MII_KSZPHY_TX_DATA_PAD_SKEW 0x106 /* KSZ9031 */ #define MII_KSZ9031_MMD_ACCESS_CTRL 0x0d #define MII_KSZ9031_MMD_ACCESS_DATA 0x0e #define MII_KSZ9031_MMD_DATA_NOINC (1 << 14) #define MII_KSZ9031_CONTROL_PAD_SKEW 0x4 #define MII_KSZ9031_RX_DATA_PAD_SKEW 0x5 #define MII_KSZ9031_TX_DATA_PAD_SKEW 0x6 #define MII_KSZ9031_CLOCK_PAD_SKEW 0x8 #define MII_KSZ8081_PHYCTL2 0x1f #define PS_TO_REG(p) ((p) / 200) static int micphy_probe(device_t); static int micphy_attach(device_t); static void micphy_reset(struct mii_softc *); static int micphy_service(struct mii_softc *, struct mii_data *, int); static device_method_t micphy_methods[] = { /* device interface */ DEVMETHOD(device_probe, micphy_probe), DEVMETHOD(device_attach, micphy_attach), DEVMETHOD(device_detach, mii_phy_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD_END }; static devclass_t micphy_devclass; static driver_t micphy_driver = { "micphy", micphy_methods, sizeof(struct mii_softc) }; DRIVER_MODULE(micphy, miibus, micphy_driver, micphy_devclass, 0, 0); static const struct mii_phydesc micphys[] = { MII_PHY_DESC(MICREL, KSZ8081), MII_PHY_DESC(MICREL, KSZ9021), MII_PHY_DESC(MICREL, KSZ9031), MII_PHY_END }; static const struct mii_phy_funcs micphy_funcs = { micphy_service, ukphy_status, micphy_reset }; static uint32_t ksz9031_read(struct mii_softc *sc, uint32_t devaddr, uint32_t reg) { /* Set up device address and register. */ PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL, devaddr); PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_DATA, reg); /* Select register data for MMD and read the value. */ PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL, MII_KSZ9031_MMD_DATA_NOINC | devaddr); return (PHY_READ(sc, MII_KSZ9031_MMD_ACCESS_DATA)); } static void ksz9031_write(struct mii_softc *sc, uint32_t devaddr, uint32_t reg, uint32_t val) { /* Set up device address and register. */ PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL, devaddr); PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_DATA, reg); /* Select register data for MMD and write the value. */ PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL, MII_KSZ9031_MMD_DATA_NOINC | devaddr); PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_DATA, val); } static uint32_t ksz9021_read(struct mii_softc *sc, uint32_t reg) { PHY_WRITE(sc, MII_KSZPHY_EXTREG, reg); return (PHY_READ(sc, MII_KSZPHY_EXTREG_READ)); } static void ksz9021_write(struct mii_softc *sc, uint32_t reg, uint32_t val) { PHY_WRITE(sc, MII_KSZPHY_EXTREG, KSZPHY_EXTREG_WRITE | reg); PHY_WRITE(sc, MII_KSZPHY_EXTREG_WRITE, val); } static void ksz90x1_load_values(struct mii_softc *sc, phandle_t node, uint32_t dev, uint32_t reg, char *field1, uint32_t f1mask, int f1off, char *field2, uint32_t f2mask, int f2off, char *field3, uint32_t f3mask, int f3off, char *field4, uint32_t f4mask, int f4off) { pcell_t dts_value[1]; int len; int val; if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ9031) val = ksz9031_read(sc, dev, reg); else val = ksz9021_read(sc, reg); if ((len = OF_getproplen(node, field1)) > 0) { OF_getencprop(node, field1, dts_value, len); val &= ~(f1mask << f1off); val |= (PS_TO_REG(dts_value[0]) & f1mask) << f1off; } if (field2 != NULL && (len = OF_getproplen(node, field2)) > 0) { OF_getencprop(node, field2, dts_value, len); val &= ~(f2mask << f2off); val |= (PS_TO_REG(dts_value[0]) & f2mask) << f2off; } if (field3 != NULL && (len = OF_getproplen(node, field3)) > 0) { OF_getencprop(node, field3, dts_value, len); val &= ~(f3mask << f3off); val |= (PS_TO_REG(dts_value[0]) & f3mask) << f3off; } if (field4 != NULL && (len = OF_getproplen(node, field4)) > 0) { OF_getencprop(node, field4, dts_value, len); val &= ~(f4mask << f4off); val |= (PS_TO_REG(dts_value[0]) & f4mask) << f4off; } if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ9031) ksz9031_write(sc, dev, reg, val); else ksz9021_write(sc, reg, val); } static void ksz9031_load_values(struct mii_softc *sc, phandle_t node) { ksz90x1_load_values(sc, node, 2, MII_KSZ9031_CONTROL_PAD_SKEW, "txen-skew-ps", 0xf, 0, "rxdv-skew-ps", 0xf, 4, NULL, 0, 0, NULL, 0, 0); ksz90x1_load_values(sc, node, 2, MII_KSZ9031_RX_DATA_PAD_SKEW, "rxd0-skew-ps", 0xf, 0, "rxd1-skew-ps", 0xf, 4, "rxd2-skew-ps", 0xf, 8, "rxd3-skew-ps", 0xf, 12); ksz90x1_load_values(sc, node, 2, MII_KSZ9031_TX_DATA_PAD_SKEW, "txd0-skew-ps", 0xf, 0, "txd1-skew-ps", 0xf, 4, "txd2-skew-ps", 0xf, 8, "txd3-skew-ps", 0xf, 12); ksz90x1_load_values(sc, node, 2, MII_KSZ9031_CLOCK_PAD_SKEW, "rxc-skew-ps", 0x1f, 0, "txc-skew-ps", 0x1f, 5, NULL, 0, 0, NULL, 0, 0); } static void ksz9021_load_values(struct mii_softc *sc, phandle_t node) { ksz90x1_load_values(sc, node, 0, MII_KSZPHY_CLK_CONTROL_PAD_SKEW, "txen-skew-ps", 0xf, 0, "txc-skew-ps", 0xf, 4, "rxdv-skew-ps", 0xf, 8, "rxc-skew-ps", 0xf, 12); ksz90x1_load_values(sc, node, 0, MII_KSZPHY_RX_DATA_PAD_SKEW, "rxd0-skew-ps", 0xf, 0, "rxd1-skew-ps", 0xf, 4, "rxd2-skew-ps", 0xf, 8, "rxd3-skew-ps", 0xf, 12); ksz90x1_load_values(sc, node, 0, MII_KSZPHY_TX_DATA_PAD_SKEW, "txd0-skew-ps", 0xf, 0, "txd1-skew-ps", 0xf, 4, "txd2-skew-ps", 0xf, 8, "txd3-skew-ps", 0xf, 12); } static int micphy_probe(device_t dev) { return (mii_phy_dev_probe(dev, micphys, BUS_PROBE_DEFAULT)); } static int micphy_attach(device_t dev) { + mii_fdt_phy_config_t *cfg; struct mii_softc *sc; phandle_t node; device_t miibus; device_t parent; sc = device_get_softc(dev); mii_phy_dev_attach(dev, MIIF_NOMANPAUSE, &micphy_funcs, 1); mii_phy_setmedia(sc); /* Nothing further to configure for 8081 model. */ if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ8081) return (0); miibus = device_get_parent(dev); parent = device_get_parent(miibus); if ((node = ofw_bus_get_node(parent)) == -1) return (ENXIO); + cfg = mii_fdt_get_config(dev); + if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ9031) - ksz9031_load_values(sc, node); + ksz9031_load_values(sc, cfg->phynode); else - ksz9021_load_values(sc, node); + ksz9021_load_values(sc, cfg->phynode); return (0); } static void micphy_reset(struct mii_softc *sc) { int reg; /* * The 8081 has no "sticky bits" that survive a soft reset; several bits * in the Phy Control Register 2 must be preserved across the reset. * These bits are set up by the bootloader; they control how the phy * interfaces to the board (such as clock frequency and LED behavior). */ if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ8081) reg = PHY_READ(sc, MII_KSZ8081_PHYCTL2); mii_phy_reset(sc); if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ8081) PHY_WRITE(sc, MII_KSZ8081_PHYCTL2, reg); } static int micphy_service(struct mii_softc *sc, struct mii_data *mii, int cmd) { switch (cmd) { case MII_POLLSTAT: break; case MII_MEDIACHG: mii_phy_setmedia(sc); break; case MII_TICK: if (mii_phy_tick(sc) == EJUSTRETURN) return (0); break; } /* Update the media status. */ PHY_STATUS(sc); /* Callback if something changed. */ mii_phy_update(sc, cmd); return (0); } Index: projects/fuse2/sys/dev/nvme/nvme.h =================================================================== --- projects/fuse2/sys/dev/nvme/nvme.h (revision 350434) +++ projects/fuse2/sys/dev/nvme/nvme.h (revision 350435) @@ -1,1536 +1,1674 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __NVME_H__ #define __NVME_H__ #ifdef _KERNEL #include #endif #include #include #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) #define NVME_RESET_CONTROLLER _IO('n', 1) #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) /* * Macros to deal with NVME revisions, as defined VS register */ #define NVME_REV(x, y) (((x) << 16) | ((y) << 8)) #define NVME_MAJOR(r) (((r) >> 16) & 0xffff) #define NVME_MINOR(r) (((r) >> 8) & 0xff) /* * Use to mark a command to apply to all namespaces, or to retrieve global * log pages. */ #define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF) /* Cap nvme to 1MB transfers driver explodes with larger sizes */ #define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? MAXPHYS : (1<<20)) /* Register field definitions */ #define NVME_CAP_LO_REG_MQES_SHIFT (0) #define NVME_CAP_LO_REG_MQES_MASK (0xFFFF) #define NVME_CAP_LO_REG_CQR_SHIFT (16) #define NVME_CAP_LO_REG_CQR_MASK (0x1) #define NVME_CAP_LO_REG_AMS_SHIFT (17) #define NVME_CAP_LO_REG_AMS_MASK (0x3) #define NVME_CAP_LO_REG_TO_SHIFT (24) #define NVME_CAP_LO_REG_TO_MASK (0xFF) #define NVME_CAP_LO_MQES(x) \ (((x) >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK) #define NVME_CAP_LO_CQR(x) \ (((x) >> NVME_CAP_LO_REG_CQR_SHIFT) & NVME_CAP_LO_REG_CQR_MASK) #define NVME_CAP_LO_AMS(x) \ (((x) >> NVME_CAP_LO_REG_AMS_SHIFT) & NVME_CAP_LO_REG_AMS_MASK) #define NVME_CAP_LO_TO(x) \ (((x) >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK) #define NVME_CAP_HI_REG_DSTRD_SHIFT (0) #define NVME_CAP_HI_REG_DSTRD_MASK (0xF) #define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5) #define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1) #define NVME_CAP_HI_REG_MPSMIN_SHIFT (16) #define NVME_CAP_HI_REG_MPSMIN_MASK (0xF) #define NVME_CAP_HI_REG_MPSMAX_SHIFT (20) #define NVME_CAP_HI_REG_MPSMAX_MASK (0xF) #define NVME_CAP_HI_DSTRD(x) \ (((x) >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK) #define NVME_CAP_HI_CSS_NVM(x) \ (((x) >> NVME_CAP_HI_REG_CSS_NVM_SHIFT) & NVME_CAP_HI_REG_CSS_NVM_MASK) #define NVME_CAP_HI_MPSMIN(x) \ (((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK) #define NVME_CAP_HI_MPSMAX(x) \ (((x) >> NVME_CAP_HI_REG_MPSMAX_SHIFT) & NVME_CAP_HI_REG_MPSMAX_MASK) #define NVME_CC_REG_EN_SHIFT (0) #define NVME_CC_REG_EN_MASK (0x1) #define NVME_CC_REG_CSS_SHIFT (4) #define NVME_CC_REG_CSS_MASK (0x7) #define NVME_CC_REG_MPS_SHIFT (7) #define NVME_CC_REG_MPS_MASK (0xF) #define NVME_CC_REG_AMS_SHIFT (11) #define NVME_CC_REG_AMS_MASK (0x7) #define NVME_CC_REG_SHN_SHIFT (14) #define NVME_CC_REG_SHN_MASK (0x3) #define NVME_CC_REG_IOSQES_SHIFT (16) #define NVME_CC_REG_IOSQES_MASK (0xF) #define NVME_CC_REG_IOCQES_SHIFT (20) #define NVME_CC_REG_IOCQES_MASK (0xF) #define NVME_CSTS_REG_RDY_SHIFT (0) #define NVME_CSTS_REG_RDY_MASK (0x1) #define NVME_CSTS_REG_CFS_SHIFT (1) #define NVME_CSTS_REG_CFS_MASK (0x1) #define NVME_CSTS_REG_SHST_SHIFT (2) #define NVME_CSTS_REG_SHST_MASK (0x3) #define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK) #define NVME_AQA_REG_ASQS_SHIFT (0) #define NVME_AQA_REG_ASQS_MASK (0xFFF) #define NVME_AQA_REG_ACQS_SHIFT (16) #define NVME_AQA_REG_ACQS_MASK (0xFFF) /* Command field definitions */ #define NVME_CMD_FUSE_SHIFT (8) #define NVME_CMD_FUSE_MASK (0x3) #define NVME_STATUS_P_SHIFT (0) #define NVME_STATUS_P_MASK (0x1) #define NVME_STATUS_SC_SHIFT (1) #define NVME_STATUS_SC_MASK (0xFF) #define NVME_STATUS_SCT_SHIFT (9) #define NVME_STATUS_SCT_MASK (0x7) #define NVME_STATUS_M_SHIFT (14) #define NVME_STATUS_M_MASK (0x1) #define NVME_STATUS_DNR_SHIFT (15) #define NVME_STATUS_DNR_MASK (0x1) #define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK) #define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK) #define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK) #define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK) #define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK) #define NVME_PWR_ST_MPS_SHIFT (0) #define NVME_PWR_ST_MPS_MASK (0x1) #define NVME_PWR_ST_NOPS_SHIFT (1) #define NVME_PWR_ST_NOPS_MASK (0x1) #define NVME_PWR_ST_RRT_SHIFT (0) #define NVME_PWR_ST_RRT_MASK (0x1F) #define NVME_PWR_ST_RRL_SHIFT (0) #define NVME_PWR_ST_RRL_MASK (0x1F) #define NVME_PWR_ST_RWT_SHIFT (0) #define NVME_PWR_ST_RWT_MASK (0x1F) #define NVME_PWR_ST_RWL_SHIFT (0) #define NVME_PWR_ST_RWL_MASK (0x1F) #define NVME_PWR_ST_IPS_SHIFT (6) #define NVME_PWR_ST_IPS_MASK (0x3) #define NVME_PWR_ST_APW_SHIFT (0) #define NVME_PWR_ST_APW_MASK (0x7) #define NVME_PWR_ST_APS_SHIFT (6) #define NVME_PWR_ST_APS_MASK (0x3) /** Controller Multi-path I/O and Namespace Sharing Capabilities */ /* More then one port */ #define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0) #define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1) /* More then one controller */ #define NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT (1) #define NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1) /* SR-IOV Virtual Function */ #define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) #define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) +/* Asymmetric Namespace Access Reporting */ +#define NVME_CTRLR_DATA_MIC_ANAR_SHIFT (3) +#define NVME_CTRLR_DATA_MIC_ANAR_MASK (0x1) /** OACS - optional admin command support */ /* supports security send/receive commands */ #define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0) #define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1) /* supports format nvm command */ #define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1) #define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1) /* supports firmware activate/download commands */ #define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2) #define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1) /* supports namespace management commands */ #define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3) #define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1) /* supports Device Self-test command */ #define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4) #define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1) /* supports Directives */ #define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5) #define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1) /* supports NVMe-MI Send/Receive */ #define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6) #define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1) /* supports Virtualization Management */ #define NVME_CTRLR_DATA_OACS_VM_SHIFT (7) #define NVME_CTRLR_DATA_OACS_VM_MASK (0x1) /* supports Doorbell Buffer Config */ #define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) #define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) +/* supports Get LBA Status */ +#define NVME_CTRLR_DATA_OACS_GETLBA_SHIFT (9) +#define NVME_CTRLR_DATA_OACS_GETLBA_MASK (0x1) /** firmware updates */ /* first slot is read-only */ #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0) #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1) /* number of firmware slots */ #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) +/* firmware activation without reset */ +#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_SHIFT (4) +#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_MASK (0x1) /** log page attributes */ /* per namespace smart/health log page */ #define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0) #define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1) /** AVSCC - admin vendor specific command configuration */ /* admin vendor specific commands use spec format */ #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0) #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1) /** Autonomous Power State Transition Attributes */ /* Autonomous Power State Transitions supported */ #define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) #define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) +/** Sanitize Capabilities */ +/* Crypto Erase Support */ +#define NVME_CTRLR_DATA_SANICAP_CES_SHIFT (0) +#define NVME_CTRLR_DATA_SANICAP_CES_MASK (0x1) +/* Block Erase Support */ +#define NVME_CTRLR_DATA_SANICAP_BES_SHIFT (1) +#define NVME_CTRLR_DATA_SANICAP_BES_MASK (0x1) +/* Overwrite Support */ +#define NVME_CTRLR_DATA_SANICAP_OWS_SHIFT (2) +#define NVME_CTRLR_DATA_SANICAP_OWS_MASK (0x1) +/* No-Deallocate Inhibited */ +#define NVME_CTRLR_DATA_SANICAP_NDI_SHIFT (29) +#define NVME_CTRLR_DATA_SANICAP_NDI_MASK (0x1) +/* No-Deallocate Modifies Media After Sanitize */ +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT (30) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_MASK (0x3) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_UNDEF (0) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_NO (1) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_YES (2) + /** submission queue entry size */ #define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) #define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4) #define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF) /** completion queue entry size */ #define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF) #define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4) #define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF) /** optional nvm command support */ #define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0) #define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1) #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2) #define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3) #define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4) #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5) #define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT (7) +#define NVME_CTRLR_DATA_ONCS_VERIFY_MASK (0x1) /** Fused Operation Support */ #define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) #define NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1) /** Format NVM Attributes */ #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0) #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1) #define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1) #define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1) #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2) #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) /** volatile write cache */ +/* volatile write cache present */ #define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) #define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) +/* flush all namespaces supported */ +#define NVME_CTRLR_DATA_VWC_ALL_SHIFT (1) +#define NVME_CTRLR_DATA_VWC_ALL_MASK (0x3) +#define NVME_CTRLR_DATA_VWC_ALL_UNKNOWN (0) +#define NVME_CTRLR_DATA_VWC_ALL_NO (2) +#define NVME_CTRLR_DATA_VWC_ALL_YES (3) /** namespace features */ /* thin provisioning */ #define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0) #define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1) /* NAWUN, NAWUPF, and NACWU fields are valid */ #define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1) #define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1) /* Deallocated or Unwritten Logical Block errors supported */ #define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2) #define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1) /* NGUID and EUI64 fields are not reusable */ #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) +/* NPWG, NPWA, NPDG, NPDA, and NOWS are valid */ +#define NVME_NS_DATA_NSFEAT_NPVALID_SHIFT (4) +#define NVME_NS_DATA_NSFEAT_NPVALID_MASK (0x1) /** formatted lba size */ #define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) #define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF) #define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4) #define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1) /** metadata capabilities */ /* metadata can be transferred as part of data prp list */ #define NVME_NS_DATA_MC_EXTENDED_SHIFT (0) #define NVME_NS_DATA_MC_EXTENDED_MASK (0x1) /* metadata can be transferred with separate metadata pointer */ #define NVME_NS_DATA_MC_POINTER_SHIFT (1) #define NVME_NS_DATA_MC_POINTER_MASK (0x1) /** end-to-end data protection capabilities */ /* protection information type 1 */ #define NVME_NS_DATA_DPC_PIT1_SHIFT (0) #define NVME_NS_DATA_DPC_PIT1_MASK (0x1) /* protection information type 2 */ #define NVME_NS_DATA_DPC_PIT2_SHIFT (1) #define NVME_NS_DATA_DPC_PIT2_MASK (0x1) /* protection information type 3 */ #define NVME_NS_DATA_DPC_PIT3_SHIFT (2) #define NVME_NS_DATA_DPC_PIT3_MASK (0x1) /* first eight bytes of metadata */ #define NVME_NS_DATA_DPC_MD_START_SHIFT (3) #define NVME_NS_DATA_DPC_MD_START_MASK (0x1) /* last eight bytes of metadata */ #define NVME_NS_DATA_DPC_MD_END_SHIFT (4) #define NVME_NS_DATA_DPC_MD_END_MASK (0x1) /** end-to-end data protection type settings */ /* protection information type */ #define NVME_NS_DATA_DPS_PIT_SHIFT (0) #define NVME_NS_DATA_DPS_PIT_MASK (0x7) /* 1 == protection info transferred at start of metadata */ /* 0 == protection info transferred at end of metadata */ #define NVME_NS_DATA_DPS_MD_START_SHIFT (3) #define NVME_NS_DATA_DPS_MD_START_MASK (0x1) /** Namespace Multi-path I/O and Namespace Sharing Capabilities */ /* the namespace may be attached to two or more controllers */ #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0) #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1) /** Reservation Capabilities */ /* Persist Through Power Loss */ #define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0) #define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1) /* supports the Write Exclusive */ #define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1) #define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1) /* supports the Exclusive Access */ #define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2) #define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1) /* supports the Write Exclusive – Registrants Only */ #define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3) #define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1) /* supports the Exclusive Access - Registrants Only */ #define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4) #define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1) /* supports the Write Exclusive – All Registrants */ #define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5) #define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1) /* supports the Exclusive Access - All Registrants */ #define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6) #define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1) /* Ignore Existing Key is used as defined in revision 1.3 or later */ #define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7) #define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1) /** Format Progress Indicator */ /* percentage of the Format NVM command that remains to be completed */ #define NVME_NS_DATA_FPI_PERC_SHIFT (0) #define NVME_NS_DATA_FPI_PERC_MASK (0x7f) /* namespace supports the Format Progress Indicator */ #define NVME_NS_DATA_FPI_SUPP_SHIFT (7) #define NVME_NS_DATA_FPI_SUPP_MASK (0x1) /** Deallocate Logical Block Features */ /* deallocated logical block read behavior */ #define NVME_NS_DATA_DLFEAT_READ_SHIFT (0) #define NVME_NS_DATA_DLFEAT_READ_MASK (0x07) #define NVME_NS_DATA_DLFEAT_READ_NR (0x00) #define NVME_NS_DATA_DLFEAT_READ_00 (0x01) #define NVME_NS_DATA_DLFEAT_READ_FF (0x02) /* supports the Deallocate bit in the Write Zeroes */ #define NVME_NS_DATA_DLFEAT_DWZ_SHIFT (3) #define NVME_NS_DATA_DLFEAT_DWZ_MASK (0x01) /* Guard field for deallocated logical blocks is set to the CRC */ #define NVME_NS_DATA_DLFEAT_GCRC_SHIFT (4) #define NVME_NS_DATA_DLFEAT_GCRC_MASK (0x01) /** lba format support */ /* metadata size */ #define NVME_NS_DATA_LBAF_MS_SHIFT (0) #define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF) /* lba data size */ #define NVME_NS_DATA_LBAF_LBADS_SHIFT (16) #define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF) /* relative performance */ #define NVME_NS_DATA_LBAF_RP_SHIFT (24) #define NVME_NS_DATA_LBAF_RP_MASK (0x3) enum nvme_critical_warning_state { NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1, NVME_CRIT_WARN_ST_TEMPERATURE = 0x2, NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4, NVME_CRIT_WARN_ST_READ_ONLY = 0x8, NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10, }; #define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0) /* slot for current FW */ #define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) #define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) /* CC register SHN field values */ enum shn_value { NVME_SHN_NORMAL = 0x1, NVME_SHN_ABRUPT = 0x2, }; /* CSTS register SHST field values */ enum shst_value { NVME_SHST_NORMAL = 0x0, NVME_SHST_OCCURRING = 0x1, NVME_SHST_COMPLETE = 0x2, }; struct nvme_registers { /** controller capabilities */ uint32_t cap_lo; uint32_t cap_hi; uint32_t vs; /* version */ uint32_t intms; /* interrupt mask set */ uint32_t intmc; /* interrupt mask clear */ /** controller configuration */ uint32_t cc; uint32_t reserved1; /** controller status */ uint32_t csts; uint32_t reserved2; /** admin queue attributes */ uint32_t aqa; uint64_t asq; /* admin submission queue base addr */ uint64_t acq; /* admin completion queue base addr */ uint32_t reserved3[0x3f2]; struct { uint32_t sq_tdbl; /* submission queue tail doorbell */ uint32_t cq_hdbl; /* completion queue head doorbell */ } doorbell[1] __packed; } __packed; _Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers"); struct nvme_command { /* dword 0 */ uint8_t opc; /* opcode */ uint8_t fuse; /* fused operation */ uint16_t cid; /* command identifier */ /* dword 1 */ uint32_t nsid; /* namespace identifier */ /* dword 2-3 */ uint32_t rsvd2; uint32_t rsvd3; /* dword 4-5 */ uint64_t mptr; /* metadata pointer */ /* dword 6-7 */ uint64_t prp1; /* prp entry 1 */ /* dword 8-9 */ uint64_t prp2; /* prp entry 2 */ /* dword 10-15 */ uint32_t cdw10; /* command-specific */ uint32_t cdw11; /* command-specific */ uint32_t cdw12; /* command-specific */ uint32_t cdw13; /* command-specific */ uint32_t cdw14; /* command-specific */ uint32_t cdw15; /* command-specific */ } __packed; _Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command"); struct nvme_completion { /* dword 0 */ uint32_t cdw0; /* command-specific */ /* dword 1 */ uint32_t rsvd1; /* dword 2 */ uint16_t sqhd; /* submission queue head pointer */ uint16_t sqid; /* submission queue identifier */ /* dword 3 */ uint16_t cid; /* command identifier */ uint16_t status; } __packed; _Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion"); struct nvme_dsm_range { uint32_t attributes; uint32_t length; uint64_t starting_lba; } __packed; /* Largest DSM Trim that can be done */ #define NVME_MAX_DSM_TRIM 4096 _Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_ranage"); /* status code types */ enum nvme_status_code_type { NVME_SCT_GENERIC = 0x0, NVME_SCT_COMMAND_SPECIFIC = 0x1, NVME_SCT_MEDIA_ERROR = 0x2, /* 0x3-0x6 - reserved */ NVME_SCT_VENDOR_SPECIFIC = 0x7, }; /* generic command status codes */ enum nvme_generic_command_status_code { NVME_SC_SUCCESS = 0x00, NVME_SC_INVALID_OPCODE = 0x01, NVME_SC_INVALID_FIELD = 0x02, NVME_SC_COMMAND_ID_CONFLICT = 0x03, NVME_SC_DATA_TRANSFER_ERROR = 0x04, NVME_SC_ABORTED_POWER_LOSS = 0x05, NVME_SC_INTERNAL_DEVICE_ERROR = 0x06, NVME_SC_ABORTED_BY_REQUEST = 0x07, NVME_SC_ABORTED_SQ_DELETION = 0x08, NVME_SC_ABORTED_FAILED_FUSED = 0x09, NVME_SC_ABORTED_MISSING_FUSED = 0x0a, NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b, NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c, NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d, NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e, NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f, NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10, NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11, NVME_SC_INVALID_USE_OF_CMB = 0x12, NVME_SC_PRP_OFFET_INVALID = 0x13, NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14, NVME_SC_OPERATION_DENIED = 0x15, NVME_SC_SGL_OFFSET_INVALID = 0x16, /* 0x17 - reserved */ NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18, NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19, NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a, NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b, NVME_SC_SANITIZE_FAILED = 0x1c, NVME_SC_SANITIZE_IN_PROGRESS = 0x1d, NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e, NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f, NVME_SC_LBA_OUT_OF_RANGE = 0x80, NVME_SC_CAPACITY_EXCEEDED = 0x81, NVME_SC_NAMESPACE_NOT_READY = 0x82, NVME_SC_RESERVATION_CONFLICT = 0x83, NVME_SC_FORMAT_IN_PROGRESS = 0x84, }; /* command specific status codes */ enum nvme_command_specific_status_code { NVME_SC_COMPLETION_QUEUE_INVALID = 0x00, NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02, NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03, /* 0x04 - reserved */ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05, NVME_SC_INVALID_FIRMWARE_SLOT = 0x06, NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07, NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08, NVME_SC_INVALID_LOG_PAGE = 0x09, NVME_SC_INVALID_FORMAT = 0x0a, NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b, NVME_SC_INVALID_QUEUE_DELETION = 0x0c, NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d, NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e, NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f, NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10, NVME_SC_FW_ACT_REQUIRES_RESET = 0x11, NVME_SC_FW_ACT_REQUIRES_TIME = 0x12, NVME_SC_FW_ACT_PROHIBITED = 0x13, NVME_SC_OVERLAPPING_RANGE = 0x14, NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15, NVME_SC_NS_ID_UNAVAILABLE = 0x16, /* 0x17 - reserved */ NVME_SC_NS_ALREADY_ATTACHED = 0x18, NVME_SC_NS_IS_PRIVATE = 0x19, NVME_SC_NS_NOT_ATTACHED = 0x1a, NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b, NVME_SC_CTRLR_LIST_INVALID = 0x1c, NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d, NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e, NVME_SC_INVALID_CTRLR_ID = 0x1f, NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20, NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21, NVME_SC_INVALID_RESOURCE_ID = 0x22, NVME_SC_CONFLICTING_ATTRIBUTES = 0x80, NVME_SC_INVALID_PROTECTION_INFO = 0x81, NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82, }; /* media error status codes */ enum nvme_media_error_status_code { NVME_SC_WRITE_FAULTS = 0x80, NVME_SC_UNRECOVERED_READ_ERROR = 0x81, NVME_SC_GUARD_CHECK_ERROR = 0x82, NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83, NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84, NVME_SC_COMPARE_FAILURE = 0x85, NVME_SC_ACCESS_DENIED = 0x86, NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87, }; /* admin opcodes */ enum nvme_admin_opcode { NVME_OPC_DELETE_IO_SQ = 0x00, NVME_OPC_CREATE_IO_SQ = 0x01, NVME_OPC_GET_LOG_PAGE = 0x02, /* 0x03 - reserved */ NVME_OPC_DELETE_IO_CQ = 0x04, NVME_OPC_CREATE_IO_CQ = 0x05, NVME_OPC_IDENTIFY = 0x06, /* 0x07 - reserved */ NVME_OPC_ABORT = 0x08, NVME_OPC_SET_FEATURES = 0x09, NVME_OPC_GET_FEATURES = 0x0a, /* 0x0b - reserved */ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c, NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d, /* 0x0e-0x0f - reserved */ NVME_OPC_FIRMWARE_ACTIVATE = 0x10, NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11, NVME_OPC_DEVICE_SELF_TEST = 0x14, NVME_OPC_NAMESPACE_ATTACHMENT = 0x15, NVME_OPC_KEEP_ALIVE = 0x18, NVME_OPC_DIRECTIVE_SEND = 0x19, NVME_OPC_DIRECTIVE_RECEIVE = 0x1a, NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c, NVME_OPC_NVME_MI_SEND = 0x1d, NVME_OPC_NVME_MI_RECEIVE = 0x1e, NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c, NVME_OPC_FORMAT_NVM = 0x80, NVME_OPC_SECURITY_SEND = 0x81, NVME_OPC_SECURITY_RECEIVE = 0x82, NVME_OPC_SANITIZE = 0x84, }; /* nvme nvm opcodes */ enum nvme_nvm_opcode { NVME_OPC_FLUSH = 0x00, NVME_OPC_WRITE = 0x01, NVME_OPC_READ = 0x02, /* 0x03 - reserved */ NVME_OPC_WRITE_UNCORRECTABLE = 0x04, NVME_OPC_COMPARE = 0x05, /* 0x06 - reserved */ NVME_OPC_WRITE_ZEROES = 0x08, /* 0x07 - reserved */ NVME_OPC_DATASET_MANAGEMENT = 0x09, /* 0x0a-0x0c - reserved */ NVME_OPC_RESERVATION_REGISTER = 0x0d, NVME_OPC_RESERVATION_REPORT = 0x0e, /* 0x0f-0x10 - reserved */ NVME_OPC_RESERVATION_ACQUIRE = 0x11, /* 0x12-0x14 - reserved */ NVME_OPC_RESERVATION_RELEASE = 0x15, }; enum nvme_feature { /* 0x00 - reserved */ NVME_FEAT_ARBITRATION = 0x01, NVME_FEAT_POWER_MANAGEMENT = 0x02, NVME_FEAT_LBA_RANGE_TYPE = 0x03, NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04, NVME_FEAT_ERROR_RECOVERY = 0x05, NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06, NVME_FEAT_NUMBER_OF_QUEUES = 0x07, NVME_FEAT_INTERRUPT_COALESCING = 0x08, NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09, NVME_FEAT_WRITE_ATOMICITY = 0x0A, NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B, NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C, NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D, NVME_FEAT_TIMESTAMP = 0x0E, NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F, NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10, NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11, /* 0x12-0x77 - reserved */ /* 0x78-0x7f - NVMe Management Interface */ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80, /* 0x81-0xBF - command set specific (reserved) */ /* 0xC0-0xFF - vendor specific */ }; enum nvme_dsm_attribute { NVME_DSM_ATTR_INTEGRAL_READ = 0x1, NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2, NVME_DSM_ATTR_DEALLOCATE = 0x4, }; enum nvme_activate_action { NVME_AA_REPLACE_NO_ACTIVATE = 0x0, NVME_AA_REPLACE_ACTIVATE = 0x1, NVME_AA_ACTIVATE = 0x2, }; struct nvme_power_state { /** Maximum Power */ uint16_t mp; /* Maximum Power */ uint8_t ps_rsvd1; uint8_t mps_nops; /* Max Power Scale, Non-Operational State */ uint32_t enlat; /* Entry Latency */ uint32_t exlat; /* Exit Latency */ uint8_t rrt; /* Relative Read Throughput */ uint8_t rrl; /* Relative Read Latency */ uint8_t rwt; /* Relative Write Throughput */ uint8_t rwl; /* Relative Write Latency */ uint16_t idlp; /* Idle Power */ uint8_t ips; /* Idle Power Scale */ uint8_t ps_rsvd8; uint16_t actp; /* Active Power */ uint8_t apw_aps; /* Active Power Workload, Active Power Scale */ uint8_t ps_rsvd10[9]; } __packed; _Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state"); #define NVME_SERIAL_NUMBER_LENGTH 20 #define NVME_MODEL_NUMBER_LENGTH 40 #define NVME_FIRMWARE_REVISION_LENGTH 8 struct nvme_controller_data { /* bytes 0-255: controller capabilities and features */ /** pci vendor id */ uint16_t vid; /** pci subsystem vendor id */ uint16_t ssvid; /** serial number */ uint8_t sn[NVME_SERIAL_NUMBER_LENGTH]; /** model number */ uint8_t mn[NVME_MODEL_NUMBER_LENGTH]; /** firmware revision */ uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH]; /** recommended arbitration burst */ uint8_t rab; /** ieee oui identifier */ uint8_t ieee[3]; /** multi-interface capabilities */ uint8_t mic; /** maximum data transfer size */ uint8_t mdts; /** Controller ID */ uint16_t ctrlr_id; /** Version */ uint32_t ver; /** RTD3 Resume Latency */ uint32_t rtd3r; /** RTD3 Enter Latency */ uint32_t rtd3e; /** Optional Asynchronous Events Supported */ uint32_t oaes; /* bitfield really */ /** Controller Attributes */ uint32_t ctratt; /* bitfield really */ - uint8_t reserved1[12]; + /** Read Recovery Levels Supported */ + uint16_t rrls; + uint8_t reserved1[9]; + + /** Controller Type */ + uint8_t cntrltype; + /** FRU Globally Unique Identifier */ uint8_t fguid[16]; - uint8_t reserved2[128]; + /** Command Retry Delay Time 1 */ + uint16_t crdt1; + /** Command Retry Delay Time 2 */ + uint16_t crdt2; + + /** Command Retry Delay Time 3 */ + uint16_t crdt3; + + uint8_t reserved2[122]; + /* bytes 256-511: admin command set attributes */ /** optional admin command support */ uint16_t oacs; /** abort command limit */ uint8_t acl; /** asynchronous event request limit */ uint8_t aerl; /** firmware updates */ uint8_t frmw; /** log page attributes */ uint8_t lpa; /** error log page entries */ uint8_t elpe; /** number of power states supported */ uint8_t npss; /** admin vendor specific command configuration */ uint8_t avscc; /** Autonomous Power State Transition Attributes */ uint8_t apsta; /** Warning Composite Temperature Threshold */ uint16_t wctemp; /** Critical Composite Temperature Threshold */ uint16_t cctemp; /** Maximum Time for Firmware Activation */ uint16_t mtfa; /** Host Memory Buffer Preferred Size */ uint32_t hmpre; /** Host Memory Buffer Minimum Size */ uint32_t hmmin; /** Name space capabilities */ struct { /* if nsmgmt, report tnvmcap and unvmcap */ uint8_t tnvmcap[16]; uint8_t unvmcap[16]; } __packed untncap; /** Replay Protected Memory Block Support */ uint32_t rpmbs; /* Really a bitfield */ /** Extended Device Self-test Time */ uint16_t edstt; /** Device Self-test Options */ uint8_t dsto; /* Really a bitfield */ /** Firmware Update Granularity */ uint8_t fwug; /** Keep Alive Support */ uint16_t kas; /** Host Controlled Thermal Management Attributes */ uint16_t hctma; /* Really a bitfield */ /** Minimum Thermal Management Temperature */ uint16_t mntmt; /** Maximum Thermal Management Temperature */ uint16_t mxtmt; /** Sanitize Capabilities */ uint32_t sanicap; /* Really a bitfield */ - uint8_t reserved3[180]; + /** Host Memory Buffer Minimum Descriptor Entry Size */ + uint32_t hmminds; + + /** Host Memory Maximum Descriptors Entries */ + uint16_t hmmaxd; + + /** NVM Set Identifier Maximum */ + uint16_t nsetidmax; + + /** Endurance Group Identifier Maximum */ + uint16_t endgidmax; + + /** ANA Transition Time */ + uint8_t anatt; + + /** Asymmetric Namespace Access Capabilities */ + uint8_t anacap; + + /** ANA Group Identifier Maximum */ + uint32_t anagrpmax; + + /** Number of ANA Group Identifiers */ + uint32_t nanagrpid; + + /** Persistent Event Log Size */ + uint32_t pels; + + uint8_t reserved3[156]; /* bytes 512-703: nvm command set attributes */ /** submission queue entry size */ uint8_t sqes; /** completion queue entry size */ uint8_t cqes; /** Maximum Outstanding Commands */ uint16_t maxcmd; /** number of namespaces */ uint32_t nn; /** optional nvm command support */ uint16_t oncs; /** fused operation support */ uint16_t fuses; /** format nvm attributes */ uint8_t fna; /** volatile write cache */ uint8_t vwc; /** Atomic Write Unit Normal */ uint16_t awun; /** Atomic Write Unit Power Fail */ uint16_t awupf; /** NVM Vendor Specific Command Configuration */ uint8_t nvscc; - uint8_t reserved5; + /** Namespace Write Protection Capabilities */ + uint8_t nwpc; + /** Atomic Compare & Write Unit */ uint16_t acwu; uint16_t reserved6; /** SGL Support */ uint32_t sgls; + /** Maximum Number of Allowed Namespaces */ + uint32_t mnan; + /* bytes 540-767: Reserved */ - uint8_t reserved7[228]; + uint8_t reserved7[224]; /** NVM Subsystem NVMe Qualified Name */ uint8_t subnqn[256]; /* bytes 1024-1791: Reserved */ uint8_t reserved8[768]; /* bytes 1792-2047: NVMe over Fabrics specification */ uint8_t reserved9[256]; /* bytes 2048-3071: power state descriptors */ struct nvme_power_state power_state[32]; /* bytes 3072-4095: vendor specific */ uint8_t vs[1024]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data"); struct nvme_namespace_data { /** namespace size */ uint64_t nsze; /** namespace capacity */ uint64_t ncap; /** namespace utilization */ uint64_t nuse; /** namespace features */ uint8_t nsfeat; /** number of lba formats */ uint8_t nlbaf; /** formatted lba size */ uint8_t flbas; /** metadata capabilities */ uint8_t mc; /** end-to-end data protection capabilities */ uint8_t dpc; /** end-to-end data protection type settings */ uint8_t dps; /** Namespace Multi-path I/O and Namespace Sharing Capabilities */ uint8_t nmic; /** Reservation Capabilities */ uint8_t rescap; /** Format Progress Indicator */ uint8_t fpi; /** Deallocate Logical Block Features */ uint8_t dlfeat; /** Namespace Atomic Write Unit Normal */ uint16_t nawun; /** Namespace Atomic Write Unit Power Fail */ uint16_t nawupf; /** Namespace Atomic Compare & Write Unit */ uint16_t nacwu; /** Namespace Atomic Boundary Size Normal */ uint16_t nabsn; /** Namespace Atomic Boundary Offset */ uint16_t nabo; /** Namespace Atomic Boundary Size Power Fail */ uint16_t nabspf; /** Namespace Optimal IO Boundary */ uint16_t noiob; /** NVM Capacity */ uint8_t nvmcap[16]; - /* bytes 64-103: Reserved */ - uint8_t reserved5[40]; + /** Namespace Preferred Write Granularity */ + uint16_t npwg; + /** Namespace Preferred Write Alignment */ + uint16_t npwa; + + /** Namespace Preferred Deallocate Granularity */ + uint16_t npdg; + + /** Namespace Preferred Deallocate Alignment */ + uint16_t npda; + + /** Namespace Optimal Write Size */ + uint16_t nows; + + /* bytes 74-91: Reserved */ + uint8_t reserved5[18]; + + /** ANA Group Identifier */ + uint32_t anagrpid; + + /* bytes 96-98: Reserved */ + uint8_t reserved6[3]; + + /** Namespace Attributes */ + uint8_t nsattr; + + /** NVM Set Identifier */ + uint16_t nvmsetid; + + /** Endurance Group Identifier */ + uint16_t endgid; + /** Namespace Globally Unique Identifier */ uint8_t nguid[16]; /** IEEE Extended Unique Identifier */ uint8_t eui64[8]; /** lba format support */ uint32_t lbaf[16]; - uint8_t reserved6[192]; + uint8_t reserved7[192]; uint8_t vendor_specific[3712]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namepsace_data"); enum nvme_log_page { /* 0x00 - reserved */ NVME_LOG_ERROR = 0x01, NVME_LOG_HEALTH_INFORMATION = 0x02, NVME_LOG_FIRMWARE_SLOT = 0x03, NVME_LOG_CHANGED_NAMESPACE = 0x04, NVME_LOG_COMMAND_EFFECT = 0x05, /* 0x06-0x7F - reserved */ /* 0x80-0xBF - I/O command set specific */ NVME_LOG_RES_NOTIFICATION = 0x80, /* 0xC0-0xFF - vendor specific */ /* * The following are Intel Specific log pages, but they seem * to be widely implemented. */ INTEL_LOG_READ_LAT_LOG = 0xc1, INTEL_LOG_WRITE_LAT_LOG = 0xc2, INTEL_LOG_TEMP_STATS = 0xc5, INTEL_LOG_ADD_SMART = 0xca, INTEL_LOG_DRIVE_MKT_NAME = 0xdd, /* * HGST log page, with lots ofs sub pages. */ HGST_INFO_LOG = 0xc1, }; struct nvme_error_information_entry { uint64_t error_count; uint16_t sqid; uint16_t cid; uint16_t status; uint16_t error_location; uint64_t lba; uint32_t nsid; uint8_t vendor_specific; uint8_t reserved[35]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry"); struct nvme_health_information_page { uint8_t critical_warning; uint16_t temperature; uint8_t available_spare; uint8_t available_spare_threshold; uint8_t percentage_used; uint8_t reserved[26]; /* * Note that the following are 128-bit values, but are * defined as an array of 2 64-bit values. */ /* Data Units Read is always in 512-byte units. */ uint64_t data_units_read[2]; /* Data Units Written is always in 512-byte units. */ uint64_t data_units_written[2]; /* For NVM command set, this includes Compare commands. */ uint64_t host_read_commands[2]; uint64_t host_write_commands[2]; /* Controller Busy Time is reported in minutes. */ uint64_t controller_busy_time[2]; uint64_t power_cycles[2]; uint64_t power_on_hours[2]; uint64_t unsafe_shutdowns[2]; uint64_t media_errors[2]; uint64_t num_error_info_log_entries[2]; uint32_t warning_temp_time; uint32_t error_temp_time; uint16_t temp_sensor[8]; uint8_t reserved2[296]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page"); struct nvme_firmware_page { uint8_t afi; uint8_t reserved[7]; uint64_t revision[7]; /* revisions for 7 slots */ uint8_t reserved2[448]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page"); struct nvme_ns_list { uint32_t ns[1024]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list"); struct intel_log_temp_stats { uint64_t current; uint64_t overtemp_flag_last; uint64_t overtemp_flag_life; uint64_t max_temp; uint64_t min_temp; uint64_t _rsvd[5]; uint64_t max_oper_temp; uint64_t min_oper_temp; uint64_t est_offset; } __packed __aligned(4); _Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats"); #define NVME_TEST_MAX_THREADS 128 struct nvme_io_test { enum nvme_nvm_opcode opc; uint32_t size; uint32_t time; /* in seconds */ uint32_t num_threads; uint32_t flags; uint64_t io_completed[NVME_TEST_MAX_THREADS]; }; enum nvme_io_test_flags { /* * Specifies whether dev_refthread/dev_relthread should be * called during NVME_BIO_TEST. Ignored for other test * types. */ NVME_TEST_FLAG_REFTHREAD = 0x1, }; struct nvme_pt_command { /* * cmd is used to specify a passthrough command to a controller or * namespace. * * The following fields from cmd may be specified by the caller: * * opc (opcode) * * nsid (namespace id) - for admin commands only * * cdw10-cdw15 * * Remaining fields must be set to 0 by the caller. */ struct nvme_command cmd; /* * cpl returns completion status for the passthrough command * specified by cmd. * * The following fields will be filled out by the driver, for * consumption by the caller: * * cdw0 * * status (except for phase) * * Remaining fields will be set to 0 by the driver. */ struct nvme_completion cpl; /* buf is the data buffer associated with this passthrough command. */ void * buf; /* * len is the length of the data buffer associated with this * passthrough command. */ uint32_t len; /* * is_read = 1 if the passthrough command will read data into the * supplied buffer from the controller. * * is_read = 0 if the passthrough command will write data from the * supplied buffer to the controller. */ uint32_t is_read; /* * driver_lock is used by the driver only. It must be set to 0 * by the caller. */ struct mtx * driver_lock; }; #define nvme_completion_is_error(cpl) \ (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL struct bio; struct nvme_namespace; struct nvme_controller; struct nvme_consumer; typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *); typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *); typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *, uint32_t, void *, uint32_t); typedef void (*nvme_cons_fail_fn_t)(void *); enum nvme_namespace_flags { NVME_NS_DEALLOCATE_SUPPORTED = 0x1, NVME_NS_FLUSH_SUPPORTED = 0x2, }; int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd); /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); /* NVM I/O functions */ int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, uint8_t num_ranges, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len); /* Registration functions */ struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn, nvme_cons_async_fn_t async_fn, nvme_cons_fail_fn_t fail_fn); void nvme_unregister_consumer(struct nvme_consumer *consumer); /* Controller helper functions */ device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr); const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr); static inline bool nvme_ctrlr_has_dataset_mgmt(const struct nvme_controller_data *cd) { /* Assumes cd was byte swapped by nvme_controller_data_swapbytes() */ return ((cd->oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) & NVME_CTRLR_DATA_ONCS_DSM_MASK); } /* Namespace helper functions */ uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns); uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns); uint64_t nvme_ns_get_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_flags(struct nvme_namespace *ns); const char * nvme_ns_get_serial_number(struct nvme_namespace *ns); const char * nvme_ns_get_model_number(struct nvme_namespace *ns); const struct nvme_namespace_data * nvme_ns_get_data(struct nvme_namespace *ns); uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns); int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn); /* * Command building helper functions -- shared with CAM * These functions assume allocator zeros out cmd structure * CAM's xpt_get_ccb and the request allocator for nvme both * do zero'd allocations. */ static inline void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid) { cmd->opc = NVME_OPC_FLUSH; cmd->nsid = htole32(nsid); } static inline void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid, uint64_t lba, uint32_t count) { cmd->opc = rwcmd; cmd->nsid = htole32(nsid); cmd->cdw10 = htole32(lba & 0xffffffffu); cmd->cdw11 = htole32(lba >> 32); cmd->cdw12 = htole32(count-1); } static inline void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count); } static inline void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count); } static inline void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid, uint32_t num_ranges) { cmd->opc = NVME_OPC_DATASET_MANAGEMENT; cmd->nsid = htole32(nsid); cmd->cdw10 = htole32(num_ranges - 1); cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); } extern int nvme_use_nvd; #endif /* _KERNEL */ /* Endianess conversion functions for NVMe structs */ static inline void nvme_completion_swapbytes(struct nvme_completion *s) { s->cdw0 = le32toh(s->cdw0); /* omit rsvd1 */ s->sqhd = le16toh(s->sqhd); s->sqid = le16toh(s->sqid); /* omit cid */ s->status = le16toh(s->status); } static inline void nvme_power_state_swapbytes(struct nvme_power_state *s) { s->mp = le16toh(s->mp); s->enlat = le32toh(s->enlat); s->exlat = le32toh(s->exlat); s->idlp = le16toh(s->idlp); s->actp = le16toh(s->actp); } static inline void nvme_controller_data_swapbytes(struct nvme_controller_data *s) { int i; s->vid = le16toh(s->vid); s->ssvid = le16toh(s->ssvid); s->ctrlr_id = le16toh(s->ctrlr_id); s->ver = le32toh(s->ver); s->rtd3r = le32toh(s->rtd3r); s->rtd3e = le32toh(s->rtd3e); s->oaes = le32toh(s->oaes); s->ctratt = le32toh(s->ctratt); + s->rrls = le16toh(s->rrls); + s->crdt1 = le16toh(s->crdt1); + s->crdt2 = le16toh(s->crdt2); + s->crdt3 = le16toh(s->crdt3); s->oacs = le16toh(s->oacs); s->wctemp = le16toh(s->wctemp); s->cctemp = le16toh(s->cctemp); s->mtfa = le16toh(s->mtfa); s->hmpre = le32toh(s->hmpre); s->hmmin = le32toh(s->hmmin); s->rpmbs = le32toh(s->rpmbs); s->edstt = le16toh(s->edstt); s->kas = le16toh(s->kas); s->hctma = le16toh(s->hctma); s->mntmt = le16toh(s->mntmt); s->mxtmt = le16toh(s->mxtmt); s->sanicap = le32toh(s->sanicap); + s->hmminds = le32toh(s->hmminds); + s->hmmaxd = le16toh(s->hmmaxd); + s->nsetidmax = le16toh(s->nsetidmax); + s->endgidmax = le16toh(s->endgidmax); + s->anagrpmax = le32toh(s->anagrpmax); + s->nanagrpid = le32toh(s->nanagrpid); + s->pels = le32toh(s->pels); s->maxcmd = le16toh(s->maxcmd); s->nn = le32toh(s->nn); s->oncs = le16toh(s->oncs); s->fuses = le16toh(s->fuses); s->awun = le16toh(s->awun); s->awupf = le16toh(s->awupf); s->acwu = le16toh(s->acwu); s->sgls = le32toh(s->sgls); + s->mnan = le32toh(s->mnan); for (i = 0; i < 32; i++) nvme_power_state_swapbytes(&s->power_state[i]); } static inline void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) { int i; s->nsze = le64toh(s->nsze); s->ncap = le64toh(s->ncap); s->nuse = le64toh(s->nuse); s->nawun = le16toh(s->nawun); s->nawupf = le16toh(s->nawupf); s->nacwu = le16toh(s->nacwu); s->nabsn = le16toh(s->nabsn); s->nabo = le16toh(s->nabo); s->nabspf = le16toh(s->nabspf); s->noiob = le16toh(s->noiob); + s->npwg = le16toh(s->npwg); + s->npwa = le16toh(s->npwa); + s->npdg = le16toh(s->npdg); + s->npda = le16toh(s->npda); + s->nows = le16toh(s->nows); + s->anagrpid = le32toh(s->anagrpid); + s->nvmsetid = le16toh(s->nvmsetid); + s->endgid = le16toh(s->endgid); for (i = 0; i < 16; i++) s->lbaf[i] = le32toh(s->lbaf[i]); } static inline void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s) { s->error_count = le64toh(s->error_count); s->sqid = le16toh(s->sqid); s->cid = le16toh(s->cid); s->status = le16toh(s->status); s->error_location = le16toh(s->error_location); s->lba = le64toh(s->lba); s->nsid = le32toh(s->nsid); } static inline void nvme_le128toh(void *p) { #if _BYTE_ORDER != _LITTLE_ENDIAN /* Swap 16 bytes in place */ char *tmp = (char*)p; char b; int i; for (i = 0; i < 8; i++) { b = tmp[i]; tmp[i] = tmp[15-i]; tmp[15-i] = b; } #else (void)p; #endif } static inline void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s) { int i; s->temperature = le16toh(s->temperature); nvme_le128toh((void *)s->data_units_read); nvme_le128toh((void *)s->data_units_written); nvme_le128toh((void *)s->host_read_commands); nvme_le128toh((void *)s->host_write_commands); nvme_le128toh((void *)s->controller_busy_time); nvme_le128toh((void *)s->power_cycles); nvme_le128toh((void *)s->power_on_hours); nvme_le128toh((void *)s->unsafe_shutdowns); nvme_le128toh((void *)s->media_errors); nvme_le128toh((void *)s->num_error_info_log_entries); s->warning_temp_time = le32toh(s->warning_temp_time); s->error_temp_time = le32toh(s->error_temp_time); for (i = 0; i < 8; i++) s->temp_sensor[i] = le16toh(s->temp_sensor[i]); } static inline void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s) { int i; for (i = 0; i < 7; i++) s->revision[i] = le64toh(s->revision[i]); } static inline void nvme_ns_list_swapbytes(struct nvme_ns_list *s) { int i; for (i = 0; i < 1024; i++) s->ns[i] = le32toh(s->ns[i]); } static inline void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) { s->current = le64toh(s->current); s->overtemp_flag_last = le64toh(s->overtemp_flag_last); s->overtemp_flag_life = le64toh(s->overtemp_flag_life); s->max_temp = le64toh(s->max_temp); s->min_temp = le64toh(s->min_temp); /* omit _rsvd[] */ s->max_oper_temp = le64toh(s->max_oper_temp); s->min_oper_temp = le64toh(s->min_oper_temp); s->est_offset = le64toh(s->est_offset); } #endif /* __NVME_H__ */ Index: projects/fuse2/sys/dev/usb/net/if_urndis.c =================================================================== --- projects/fuse2/sys/dev/usb/net/if_urndis.c (revision 350434) +++ projects/fuse2/sys/dev/usb/net/if_urndis.c (revision 350435) @@ -1,1055 +1,1058 @@ /* $OpenBSD: if_urndis.c,v 1.46 2013/12/09 15:45:29 pirofti Exp $ */ /* * Copyright (c) 2010 Jonathan Armani * Copyright (c) 2010 Fabien Romano * Copyright (c) 2010 Michael Knudsen * Copyright (c) 2014 Hans Petter Selasky * All rights reserved. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "usbdevs.h" #define USB_DEBUG_VAR urndis_debug #include #include #include "usb_if.h" #include #include #include static device_probe_t urndis_probe; static device_attach_t urndis_attach; static device_detach_t urndis_detach; static device_suspend_t urndis_suspend; static device_resume_t urndis_resume; static usb_callback_t urndis_bulk_write_callback; static usb_callback_t urndis_bulk_read_callback; static usb_callback_t urndis_intr_read_callback; static uether_fn_t urndis_attach_post; static uether_fn_t urndis_init; static uether_fn_t urndis_stop; static uether_fn_t urndis_start; static uether_fn_t urndis_setmulti; static uether_fn_t urndis_setpromisc; static uint32_t urndis_ctrl_query(struct urndis_softc *sc, uint32_t oid, struct rndis_query_req *msg, uint16_t len, const void **rbuf, uint16_t *rbufsz); static uint32_t urndis_ctrl_set(struct urndis_softc *sc, uint32_t oid, struct rndis_set_req *msg, uint16_t len); static uint32_t urndis_ctrl_handle_init(struct urndis_softc *sc, const struct rndis_comp_hdr *hdr); static uint32_t urndis_ctrl_handle_query(struct urndis_softc *sc, const struct rndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz); static uint32_t urndis_ctrl_handle_reset(struct urndis_softc *sc, const struct rndis_comp_hdr *hdr); static uint32_t urndis_ctrl_init(struct urndis_softc *sc); static uint32_t urndis_ctrl_halt(struct urndis_softc *sc); #ifdef USB_DEBUG static int urndis_debug = 0; static SYSCTL_NODE(_hw_usb, OID_AUTO, urndis, CTLFLAG_RW, 0, "USB RNDIS-Ethernet"); SYSCTL_INT(_hw_usb_urndis, OID_AUTO, debug, CTLFLAG_RWTUN, &urndis_debug, 0, "Debug level"); #endif static const struct usb_config urndis_config[URNDIS_N_TRANSFER] = { [URNDIS_BULK_RX] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_RX, .if_index = 0, .frames = 1, .bufsize = RNDIS_RX_MAXLEN, .flags = {.short_xfer_ok = 1,}, .callback = urndis_bulk_read_callback, .timeout = 0, /* no timeout */ .usb_mode = USB_MODE_HOST, }, [URNDIS_BULK_TX] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_TX, .if_index = 0, .frames = RNDIS_TX_FRAMES_MAX, .bufsize = (RNDIS_TX_FRAMES_MAX * RNDIS_TX_MAXLEN), .flags = { .force_short_xfer = 1, }, .callback = urndis_bulk_write_callback, .timeout = 10000, /* 10 seconds */ .usb_mode = USB_MODE_HOST, }, [URNDIS_INTR_RX] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_RX, .if_index = 1, .bufsize = 0, /* use wMaxPacketSize */ .flags = {.short_xfer_ok = 1,.no_pipe_ok = 1,}, .callback = urndis_intr_read_callback, .timeout = 0, .usb_mode = USB_MODE_HOST, }, }; static device_method_t urndis_methods[] = { /* Device interface */ DEVMETHOD(device_probe, urndis_probe), DEVMETHOD(device_attach, urndis_attach), DEVMETHOD(device_detach, urndis_detach), DEVMETHOD(device_suspend, urndis_suspend), DEVMETHOD(device_resume, urndis_resume), DEVMETHOD_END }; static driver_t urndis_driver = { .name = "urndis", .methods = urndis_methods, .size = sizeof(struct urndis_softc), }; static devclass_t urndis_devclass; static const STRUCT_USB_HOST_ID urndis_host_devs[] = { /* Generic RNDIS class match */ {USB_IFACE_CLASS(UICLASS_CDC), USB_IFACE_SUBCLASS(UISUBCLASS_ABSTRACT_CONTROL_MODEL), USB_IFACE_PROTOCOL(0xff)}, {USB_IFACE_CLASS(UICLASS_WIRELESS), USB_IFACE_SUBCLASS(UISUBCLASS_RF), USB_IFACE_PROTOCOL(UIPROTO_RNDIS)}, {USB_IFACE_CLASS(UICLASS_IAD), USB_IFACE_SUBCLASS(UISUBCLASS_SYNC), USB_IFACE_PROTOCOL(UIPROTO_ACTIVESYNC)}, /* HP-WebOS */ {USB_VENDOR(USB_VENDOR_PALM), USB_IFACE_CLASS(UICLASS_CDC), USB_IFACE_SUBCLASS(UISUBCLASS_ABSTRACT_CONTROL_MODEL), USB_IFACE_PROTOCOL(0xff)}, + /* Nokia 7 plus */ + {USB_IFACE_CLASS(UICLASS_IAD), USB_IFACE_SUBCLASS(0x4), + USB_IFACE_PROTOCOL(UIPROTO_ACTIVESYNC)}, }; DRIVER_MODULE(urndis, uhub, urndis_driver, urndis_devclass, NULL, NULL); MODULE_VERSION(urndis, 1); MODULE_DEPEND(urndis, uether, 1, 1, 1); MODULE_DEPEND(urndis, usb, 1, 1, 1); MODULE_DEPEND(urndis, ether, 1, 1, 1); USB_PNP_HOST_INFO(urndis_host_devs); static const struct usb_ether_methods urndis_ue_methods = { .ue_attach_post = urndis_attach_post, .ue_start = urndis_start, .ue_init = urndis_init, .ue_stop = urndis_stop, .ue_setmulti = urndis_setmulti, .ue_setpromisc = urndis_setpromisc, }; static int urndis_probe(device_t dev) { struct usb_attach_arg *uaa = device_get_ivars(dev); return (usbd_lookup_id_by_uaa(urndis_host_devs, sizeof(urndis_host_devs), uaa)); } static void urndis_attach_post(struct usb_ether *ue) { /* no-op */ } static int urndis_attach(device_t dev) { static struct { union { struct rndis_query_req query; struct rndis_set_req set; } hdr; union { uint8_t eaddr[ETHER_ADDR_LEN]; uint32_t filter; } ibuf; } msg; struct urndis_softc *sc = device_get_softc(dev); struct usb_ether *ue = &sc->sc_ue; struct usb_attach_arg *uaa = device_get_ivars(dev); struct usb_cdc_cm_descriptor *cmd; const void *buf; uint16_t bufsz; uint8_t iface_index[2] = { uaa->info.bIfaceIndex + 1, uaa->info.bIfaceIndex }; int error; uint8_t i; sc->sc_ue.ue_udev = uaa->device; sc->sc_ifaceno_ctl = uaa->info.bIfaceNum; cmd = usbd_find_descriptor(uaa->device, NULL, uaa->info.bIfaceIndex, UDESC_CS_INTERFACE, 0xFF, UDESCSUB_CDC_CM, 0xFF); if (cmd != NULL) { DPRINTF("Call Mode Descriptor found, dataif=%d\n", cmd->bDataInterface); iface_index[0] = cmd->bDataInterface; } device_set_usb_desc(dev); mtx_init(&sc->sc_mtx, device_get_nameunit(dev), NULL, MTX_DEF); /* scan the alternate settings looking for a valid one */ for (i = 0; i != 32; i++) { error = usbd_set_alt_interface_index(uaa->device, iface_index[0], i); if (error != 0) break; error = usbd_transfer_setup(uaa->device, iface_index, sc->sc_xfer, urndis_config, URNDIS_N_TRANSFER, sc, &sc->sc_mtx); if (error == 0) break; } if ((error != 0) || (i == 32)) { device_printf(dev, "No valid alternate setting found\n"); goto detach; } /* Initialize device - must be done before even querying it */ URNDIS_LOCK(sc); error = urndis_ctrl_init(sc); URNDIS_UNLOCK(sc); if (error != (int)RNDIS_STATUS_SUCCESS) { device_printf(dev, "Unable to initialize hardware\n"); goto detach; } /* Determine MAC address */ memset(msg.ibuf.eaddr, 0, sizeof(msg.ibuf.eaddr)); URNDIS_LOCK(sc); error = urndis_ctrl_query(sc, OID_802_3_PERMANENT_ADDRESS, &msg.hdr.query, sizeof(msg.hdr.query) + sizeof(msg.ibuf.eaddr), &buf, &bufsz); URNDIS_UNLOCK(sc); if (error != (int)RNDIS_STATUS_SUCCESS) { device_printf(dev, "Unable to get hardware address\n"); goto detach; } if (bufsz != ETHER_ADDR_LEN) { device_printf(dev, "Invalid address length: %d bytes\n", bufsz); goto detach; } memcpy(&sc->sc_ue.ue_eaddr, buf, ETHER_ADDR_LEN); /* Initialize packet filter */ sc->sc_filter = NDIS_PACKET_TYPE_BROADCAST | NDIS_PACKET_TYPE_ALL_MULTICAST; msg.ibuf.filter = htole32(sc->sc_filter); URNDIS_LOCK(sc); error = urndis_ctrl_set(sc, OID_GEN_CURRENT_PACKET_FILTER, &msg.hdr.set, sizeof(msg.hdr.set) + sizeof(msg.ibuf.filter)); URNDIS_UNLOCK(sc); if (error != (int)RNDIS_STATUS_SUCCESS) { device_printf(dev, "Unable to set data filters\n"); goto detach; } ue->ue_sc = sc; ue->ue_dev = dev; ue->ue_udev = uaa->device; ue->ue_mtx = &sc->sc_mtx; ue->ue_methods = &urndis_ue_methods; error = uether_ifattach(ue); if (error) { device_printf(dev, "Could not attach interface\n"); goto detach; } URNDIS_LOCK(sc); /* start interrupt endpoint, if any */ usbd_transfer_start(sc->sc_xfer[URNDIS_INTR_RX]); URNDIS_UNLOCK(sc); return (0); /* success */ detach: (void)urndis_detach(dev); return (ENXIO); /* failure */ } static int urndis_detach(device_t dev) { struct urndis_softc *sc = device_get_softc(dev); struct usb_ether *ue = &sc->sc_ue; /* stop all USB transfers first */ usbd_transfer_unsetup(sc->sc_xfer, URNDIS_N_TRANSFER); uether_ifdetach(ue); URNDIS_LOCK(sc); (void)urndis_ctrl_halt(sc); URNDIS_UNLOCK(sc); mtx_destroy(&sc->sc_mtx); return (0); } static void urndis_start(struct usb_ether *ue) { struct urndis_softc *sc = uether_getsc(ue); /* * Start the USB transfers, if not already started: */ usbd_transfer_start(sc->sc_xfer[URNDIS_BULK_TX]); usbd_transfer_start(sc->sc_xfer[URNDIS_BULK_RX]); } static void urndis_init(struct usb_ether *ue) { struct urndis_softc *sc = uether_getsc(ue); struct ifnet *ifp = uether_getifp(ue); URNDIS_LOCK_ASSERT(sc, MA_OWNED); ifp->if_drv_flags |= IFF_DRV_RUNNING; /* stall data write direction, which depends on USB mode */ usbd_xfer_set_stall(sc->sc_xfer[URNDIS_BULK_TX]); /* start data transfers */ urndis_start(ue); } static void urndis_stop(struct usb_ether *ue) { struct urndis_softc *sc = uether_getsc(ue); struct ifnet *ifp = uether_getifp(ue); URNDIS_LOCK_ASSERT(sc, MA_OWNED); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * stop all the transfers, if not already stopped: */ usbd_transfer_stop(sc->sc_xfer[URNDIS_BULK_RX]); usbd_transfer_stop(sc->sc_xfer[URNDIS_BULK_TX]); } static void urndis_setmulti(struct usb_ether *ue) { /* no-op */ } static void urndis_setpromisc(struct usb_ether *ue) { /* no-op */ } static int urndis_suspend(device_t dev) { device_printf(dev, "Suspending\n"); return (0); } static int urndis_resume(device_t dev) { device_printf(dev, "Resuming\n"); return (0); } static usb_error_t urndis_ctrl_msg(struct urndis_softc *sc, uint8_t rt, uint8_t r, uint16_t index, uint16_t value, void *buf, uint16_t buflen) { usb_device_request_t req; req.bmRequestType = rt; req.bRequest = r; USETW(req.wValue, value); USETW(req.wIndex, index); USETW(req.wLength, buflen); return (usbd_do_request_flags(sc->sc_ue.ue_udev, &sc->sc_mtx, &req, buf, (rt & UT_READ) ? USB_SHORT_XFER_OK : 0, NULL, 2000 /* ms */ )); } static usb_error_t urndis_ctrl_send(struct urndis_softc *sc, void *buf, uint16_t len) { usb_error_t err; err = urndis_ctrl_msg(sc, UT_WRITE_CLASS_INTERFACE, UCDC_SEND_ENCAPSULATED_COMMAND, sc->sc_ifaceno_ctl, 0, buf, len); DPRINTF("%s\n", usbd_errstr(err)); return (err); } static struct rndis_comp_hdr * urndis_ctrl_recv(struct urndis_softc *sc) { struct rndis_comp_hdr *hdr; usb_error_t err; err = urndis_ctrl_msg(sc, UT_READ_CLASS_INTERFACE, UCDC_GET_ENCAPSULATED_RESPONSE, sc->sc_ifaceno_ctl, 0, sc->sc_response_buf, RNDIS_RESPONSE_LEN); if (err != USB_ERR_NORMAL_COMPLETION) return (NULL); hdr = (struct rndis_comp_hdr *)sc->sc_response_buf; DPRINTF("type 0x%x len %u\n", le32toh(hdr->rm_type), le32toh(hdr->rm_len)); if (le32toh(hdr->rm_len) > RNDIS_RESPONSE_LEN) { DPRINTF("ctrl message error: wrong size %u > %u\n", le32toh(hdr->rm_len), RNDIS_RESPONSE_LEN); return (NULL); } return (hdr); } static uint32_t urndis_ctrl_handle(struct urndis_softc *sc, struct rndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz) { uint32_t rval; DPRINTF("\n"); if (buf != NULL && bufsz != NULL) { *buf = NULL; *bufsz = 0; } switch (le32toh(hdr->rm_type)) { case REMOTE_NDIS_INITIALIZE_CMPLT: rval = urndis_ctrl_handle_init(sc, hdr); break; case REMOTE_NDIS_QUERY_CMPLT: rval = urndis_ctrl_handle_query(sc, hdr, buf, bufsz); break; case REMOTE_NDIS_RESET_CMPLT: rval = urndis_ctrl_handle_reset(sc, hdr); break; case REMOTE_NDIS_KEEPALIVE_CMPLT: case REMOTE_NDIS_SET_CMPLT: rval = le32toh(hdr->rm_status); break; default: device_printf(sc->sc_ue.ue_dev, "ctrl message error: unknown event 0x%x\n", le32toh(hdr->rm_type)); rval = RNDIS_STATUS_FAILURE; break; } return (rval); } static uint32_t urndis_ctrl_handle_init(struct urndis_softc *sc, const struct rndis_comp_hdr *hdr) { const struct rndis_init_comp *msg; msg = (const struct rndis_init_comp *)hdr; DPRINTF("len %u rid %u status 0x%x " "ver_major %u ver_minor %u devflags 0x%x medium 0x%x pktmaxcnt %u " "pktmaxsz %u align %u aflistoffset %u aflistsz %u\n", le32toh(msg->rm_len), le32toh(msg->rm_rid), le32toh(msg->rm_status), le32toh(msg->rm_ver_major), le32toh(msg->rm_ver_minor), le32toh(msg->rm_devflags), le32toh(msg->rm_medium), le32toh(msg->rm_pktmaxcnt), le32toh(msg->rm_pktmaxsz), le32toh(msg->rm_align), le32toh(msg->rm_aflistoffset), le32toh(msg->rm_aflistsz)); if (le32toh(msg->rm_status) != RNDIS_STATUS_SUCCESS) { DPRINTF("init failed 0x%x\n", le32toh(msg->rm_status)); return (le32toh(msg->rm_status)); } if (le32toh(msg->rm_devflags) != RNDIS_DF_CONNECTIONLESS) { DPRINTF("wrong device type (current type: 0x%x)\n", le32toh(msg->rm_devflags)); return (RNDIS_STATUS_FAILURE); } if (le32toh(msg->rm_medium) != RNDIS_MEDIUM_802_3) { DPRINTF("medium not 802.3 (current medium: 0x%x)\n", le32toh(msg->rm_medium)); return (RNDIS_STATUS_FAILURE); } sc->sc_lim_pktsz = le32toh(msg->rm_pktmaxsz); return (le32toh(msg->rm_status)); } static uint32_t urndis_ctrl_handle_query(struct urndis_softc *sc, const struct rndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz) { const struct rndis_query_comp *msg; uint64_t limit; msg = (const struct rndis_query_comp *)hdr; DPRINTF("len %u rid %u status 0x%x " "buflen %u bufoff %u\n", le32toh(msg->rm_len), le32toh(msg->rm_rid), le32toh(msg->rm_status), le32toh(msg->rm_infobuflen), le32toh(msg->rm_infobufoffset)); *buf = NULL; *bufsz = 0; if (le32toh(msg->rm_status) != RNDIS_STATUS_SUCCESS) { DPRINTF("query failed 0x%x\n", le32toh(msg->rm_status)); return (le32toh(msg->rm_status)); } limit = le32toh(msg->rm_infobuflen); limit += le32toh(msg->rm_infobufoffset); limit += RNDIS_HEADER_OFFSET; if (limit > (uint64_t)le32toh(msg->rm_len)) { DPRINTF("ctrl message error: invalid query info " "len/offset/end_position(%u/%u/%u) -> " "go out of buffer limit %u\n", le32toh(msg->rm_infobuflen), le32toh(msg->rm_infobufoffset), le32toh(msg->rm_infobuflen) + le32toh(msg->rm_infobufoffset) + RNDIS_HEADER_OFFSET, le32toh(msg->rm_len)); return (RNDIS_STATUS_FAILURE); } *buf = ((const uint8_t *)msg) + RNDIS_HEADER_OFFSET + le32toh(msg->rm_infobufoffset); *bufsz = le32toh(msg->rm_infobuflen); return (le32toh(msg->rm_status)); } static uint32_t urndis_ctrl_handle_reset(struct urndis_softc *sc, const struct rndis_comp_hdr *hdr) { const struct rndis_reset_comp *msg; uint32_t rval; msg = (const struct rndis_reset_comp *)hdr; rval = le32toh(msg->rm_status); DPRINTF("len %u status 0x%x " "adrreset %u\n", le32toh(msg->rm_len), rval, le32toh(msg->rm_adrreset)); if (rval != RNDIS_STATUS_SUCCESS) { DPRINTF("reset failed 0x%x\n", rval); return (rval); } if (msg->rm_adrreset != 0) { struct { struct rndis_set_req hdr; uint32_t filter; } msg_filter; msg_filter.filter = htole32(sc->sc_filter); rval = urndis_ctrl_set(sc, OID_GEN_CURRENT_PACKET_FILTER, &msg_filter.hdr, sizeof(msg_filter)); if (rval != RNDIS_STATUS_SUCCESS) { DPRINTF("unable to reset data filters\n"); return (rval); } } return (rval); } static uint32_t urndis_ctrl_init(struct urndis_softc *sc) { struct rndis_init_req msg; struct rndis_comp_hdr *hdr; uint32_t rval; msg.rm_type = htole32(REMOTE_NDIS_INITIALIZE_MSG); msg.rm_len = htole32(sizeof(msg)); msg.rm_rid = 0; msg.rm_ver_major = htole32(RNDIS_VERSION_MAJOR); msg.rm_ver_minor = htole32(1); msg.rm_max_xfersz = htole32(RNDIS_RX_MAXLEN); DPRINTF("type %u len %u rid %u ver_major %u " "ver_minor %u max_xfersz %u\n", le32toh(msg.rm_type), le32toh(msg.rm_len), le32toh(msg.rm_rid), le32toh(msg.rm_ver_major), le32toh(msg.rm_ver_minor), le32toh(msg.rm_max_xfersz)); rval = urndis_ctrl_send(sc, &msg, sizeof(msg)); if (rval != RNDIS_STATUS_SUCCESS) { DPRINTF("init failed\n"); return (rval); } if ((hdr = urndis_ctrl_recv(sc)) == NULL) { DPRINTF("unable to get init response\n"); return (RNDIS_STATUS_FAILURE); } rval = urndis_ctrl_handle(sc, hdr, NULL, NULL); return (rval); } static uint32_t urndis_ctrl_halt(struct urndis_softc *sc) { struct rndis_halt_req msg; uint32_t rval; msg.rm_type = htole32(REMOTE_NDIS_HALT_MSG); msg.rm_len = htole32(sizeof(msg)); msg.rm_rid = 0; DPRINTF("type %u len %u rid %u\n", le32toh(msg.rm_type), le32toh(msg.rm_len), le32toh(msg.rm_rid)); rval = urndis_ctrl_send(sc, &msg, sizeof(msg)); if (rval != RNDIS_STATUS_SUCCESS) DPRINTF("halt failed\n"); return (rval); } /* * NB: Querying a device has the requirement of using an input buffer the size * of the expected reply or larger, except for variably sized replies. */ static uint32_t urndis_ctrl_query(struct urndis_softc *sc, uint32_t oid, struct rndis_query_req *msg, uint16_t len, const void **rbuf, uint16_t *rbufsz) { struct rndis_comp_hdr *hdr; uint32_t datalen, rval; msg->rm_type = htole32(REMOTE_NDIS_QUERY_MSG); msg->rm_len = htole32(len); msg->rm_rid = 0; /* XXX */ msg->rm_oid = htole32(oid); datalen = len - sizeof(*msg); msg->rm_infobuflen = htole32(datalen); if (datalen != 0) { msg->rm_infobufoffset = htole32(sizeof(*msg) - RNDIS_HEADER_OFFSET); } else { msg->rm_infobufoffset = 0; } msg->rm_devicevchdl = 0; DPRINTF("type %u len %u rid %u oid 0x%x " "infobuflen %u infobufoffset %u devicevchdl %u\n", le32toh(msg->rm_type), le32toh(msg->rm_len), le32toh(msg->rm_rid), le32toh(msg->rm_oid), le32toh(msg->rm_infobuflen), le32toh(msg->rm_infobufoffset), le32toh(msg->rm_devicevchdl)); rval = urndis_ctrl_send(sc, msg, len); if (rval != RNDIS_STATUS_SUCCESS) { DPRINTF("query failed\n"); return (rval); } if ((hdr = urndis_ctrl_recv(sc)) == NULL) { DPRINTF("unable to get query response\n"); return (RNDIS_STATUS_FAILURE); } rval = urndis_ctrl_handle(sc, hdr, rbuf, rbufsz); return (rval); } static uint32_t urndis_ctrl_set(struct urndis_softc *sc, uint32_t oid, struct rndis_set_req *msg, uint16_t len) { struct rndis_comp_hdr *hdr; uint32_t datalen, rval; msg->rm_type = htole32(REMOTE_NDIS_SET_MSG); msg->rm_len = htole32(len); msg->rm_rid = 0; /* XXX */ msg->rm_oid = htole32(oid); datalen = len - sizeof(*msg); msg->rm_infobuflen = htole32(datalen); if (datalen != 0) { msg->rm_infobufoffset = htole32(sizeof(*msg) - RNDIS_HEADER_OFFSET); } else { msg->rm_infobufoffset = 0; } msg->rm_devicevchdl = 0; DPRINTF("type %u len %u rid %u oid 0x%x " "infobuflen %u infobufoffset %u devicevchdl %u\n", le32toh(msg->rm_type), le32toh(msg->rm_len), le32toh(msg->rm_rid), le32toh(msg->rm_oid), le32toh(msg->rm_infobuflen), le32toh(msg->rm_infobufoffset), le32toh(msg->rm_devicevchdl)); rval = urndis_ctrl_send(sc, msg, len); if (rval != RNDIS_STATUS_SUCCESS) { DPRINTF("set failed\n"); return (rval); } if ((hdr = urndis_ctrl_recv(sc)) == NULL) { DPRINTF("unable to get set response\n"); return (RNDIS_STATUS_FAILURE); } rval = urndis_ctrl_handle(sc, hdr, NULL, NULL); if (rval != RNDIS_STATUS_SUCCESS) DPRINTF("set failed 0x%x\n", rval); return (rval); } static void urndis_bulk_read_callback(struct usb_xfer *xfer, usb_error_t error) { struct urndis_softc *sc = usbd_xfer_softc(xfer); struct usb_page_cache *pc = usbd_xfer_get_frame(xfer, 0); struct ifnet *ifp = uether_getifp(&sc->sc_ue); struct rndis_packet_msg msg; struct mbuf *m; int actlen; int aframes; int offset; switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: usbd_xfer_status(xfer, &actlen, NULL, &aframes, NULL); DPRINTFN(1, "received %u bytes in %u frames\n", actlen, aframes); for (offset = 0; actlen >= (uint32_t)sizeof(msg);) { /* copy out header */ usbd_copy_out(pc, offset, &msg, sizeof(msg)); if (le32toh(0x1234567U) != 0x1234567U) { /* swap endianness */ msg.rm_type = le32toh(msg.rm_type); msg.rm_len = le32toh(msg.rm_len); msg.rm_dataoffset = le32toh(msg.rm_dataoffset); msg.rm_datalen = le32toh(msg.rm_datalen); msg.rm_oobdataoffset = le32toh(msg.rm_oobdataoffset); msg.rm_oobdatalen = le32toh(msg.rm_oobdatalen); msg.rm_oobdataelements = le32toh(msg.rm_oobdataelements); msg.rm_pktinfooffset = le32toh(msg.rm_pktinfooffset); msg.rm_pktinfolen = le32toh(msg.rm_pktinfolen); msg.rm_vchandle = le32toh(msg.rm_vchandle); msg.rm_reserved = le32toh(msg.rm_reserved); } DPRINTF("len %u data(off:%u len:%u) " "oobdata(off:%u len:%u nb:%u) perpacket(off:%u len:%u)\n", msg.rm_len, msg.rm_dataoffset, msg.rm_datalen, msg.rm_oobdataoffset, msg.rm_oobdatalen, msg.rm_oobdataelements, msg.rm_pktinfooffset, msg.rm_pktinfooffset); /* sanity check the RNDIS header */ if (msg.rm_type != REMOTE_NDIS_PACKET_MSG) { DPRINTF("invalid type 0x%x != 0x%x\n", msg.rm_type, REMOTE_NDIS_PACKET_MSG); goto tr_setup; } else if (msg.rm_len < (uint32_t)sizeof(msg)) { DPRINTF("invalid msg len %u < %u\n", msg.rm_len, (unsigned)sizeof(msg)); goto tr_setup; } else if (msg.rm_len > (uint32_t)actlen) { DPRINTF("invalid msg len %u > buffer " "len %u\n", msg.rm_len, actlen); goto tr_setup; } else if (msg.rm_dataoffset >= (uint32_t)actlen) { DPRINTF("invalid msg dataoffset %u > buffer " "dataoffset %u\n", msg.rm_dataoffset, actlen); goto tr_setup; } else if (msg.rm_datalen > (uint32_t)actlen) { DPRINTF("invalid msg datalen %u > buffer " "datalen %u\n", msg.rm_datalen, actlen); goto tr_setup; } else if ((msg.rm_dataoffset + msg.rm_datalen + (uint32_t)__offsetof(struct rndis_packet_msg, rm_dataoffset)) > (uint32_t)actlen) { DPRINTF("invalid dataoffset %u larger than %u\n", msg.rm_dataoffset + msg.rm_datalen + (uint32_t)__offsetof(struct rndis_packet_msg, rm_dataoffset), actlen); goto tr_setup; } else if (msg.rm_datalen < (uint32_t)sizeof(struct ether_header)) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); DPRINTF("invalid ethernet size " "%u < %u\n", msg.rm_datalen, (unsigned)sizeof(struct ether_header)); goto tr_setup; } else if (msg.rm_datalen > (uint32_t)(MCLBYTES - ETHER_ALIGN)) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); DPRINTF("invalid ethernet size " "%u > %u\n", msg.rm_datalen, (unsigned)MCLBYTES); goto tr_setup; } else if (msg.rm_datalen > (uint32_t)(MHLEN - ETHER_ALIGN)) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); } else { m = m_gethdr(M_NOWAIT, MT_DATA); } /* check if we have a buffer */ if (m != NULL) { m->m_len = m->m_pkthdr.len = msg.rm_datalen + ETHER_ALIGN; m_adj(m, ETHER_ALIGN); usbd_copy_out(pc, offset + msg.rm_dataoffset + __offsetof(struct rndis_packet_msg, rm_dataoffset), m->m_data, msg.rm_datalen); /* enqueue */ uether_rxmbuf(&sc->sc_ue, m, msg.rm_datalen); } else { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } offset += msg.rm_len; actlen -= msg.rm_len; } case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, RNDIS_RX_MAXLEN); usbd_xfer_set_frames(xfer, 1); usbd_transfer_submit(xfer); uether_rxflush(&sc->sc_ue); /* must be last */ break; default: /* Error */ DPRINTFN(1, "error = %s\n", usbd_errstr(error)); if (error != USB_ERR_CANCELLED) { /* try to clear stall first */ usbd_xfer_set_stall(xfer); usbd_xfer_set_frames(xfer, 0); usbd_transfer_submit(xfer); } break; } } static void urndis_bulk_write_callback(struct usb_xfer *xfer, usb_error_t error) { struct rndis_packet_msg msg; struct urndis_softc *sc = usbd_xfer_softc(xfer); struct ifnet *ifp = uether_getifp(&sc->sc_ue); struct mbuf *m; unsigned x; int actlen; int aframes; usbd_xfer_status(xfer, &actlen, NULL, &aframes, NULL); DPRINTFN(1, "\n"); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: DPRINTFN(11, "%u bytes in %u frames\n", actlen, aframes); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: memset(&msg, 0, sizeof(msg)); for (x = 0; x != RNDIS_TX_FRAMES_MAX; x++) { struct usb_page_cache *pc = usbd_xfer_get_frame(xfer, x); usbd_xfer_set_frame_offset(xfer, x * RNDIS_TX_MAXLEN, x); next_pkt: IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; if ((m->m_pkthdr.len + sizeof(msg)) > RNDIS_TX_MAXLEN) { DPRINTF("Too big packet\n"); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* Free buffer */ m_freem(m); goto next_pkt; } msg.rm_type = htole32(REMOTE_NDIS_PACKET_MSG); msg.rm_len = htole32(sizeof(msg) + m->m_pkthdr.len); msg.rm_dataoffset = htole32(RNDIS_DATA_OFFSET); msg.rm_datalen = htole32(m->m_pkthdr.len); /* copy in all data */ usbd_copy_in(pc, 0, &msg, sizeof(msg)); usbd_m_copy_in(pc, sizeof(msg), m, 0, m->m_pkthdr.len); usbd_xfer_set_frame_len(xfer, x, sizeof(msg) + m->m_pkthdr.len); /* * If there's a BPF listener, bounce a copy of * this frame to him: */ BPF_MTAP(ifp, m); /* Free buffer */ m_freem(m); } if (x != 0) { usbd_xfer_set_frames(xfer, x); usbd_transfer_submit(xfer); } break; default: /* Error */ DPRINTFN(11, "transfer error, %s\n", usbd_errstr(error)); /* count output errors */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (error != USB_ERR_CANCELLED) { /* try to clear stall first */ usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } static void urndis_intr_read_callback(struct usb_xfer *xfer, usb_error_t error) { int actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: DPRINTF("Received %d bytes\n", actlen); /* TODO: decode some indications */ /* FALLTHROUGH */ case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: /* Error */ if (error != USB_ERR_CANCELLED) { /* start clear stall */ usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } Index: projects/fuse2/sys/fs/devfs/devfs_vnops.c =================================================================== --- projects/fuse2/sys/fs/devfs/devfs_vnops.c (revision 350434) +++ projects/fuse2/sys/fs/devfs/devfs_vnops.c (revision 350435) @@ -1,1995 +1,1996 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. */ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp, 0); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } if ((vp->v_iflag & VI_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; int *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); i = *buflen; dd = vp->v_data; if (vp->v_type == VCHR) { i -= strlen(dd->de_cdp->cdp_c.si_name); if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_cdp->cdp_c.si_name, buf + i, strlen(dd->de_cdp->cdp_c.si_name)); de = dd->de_dir; } else if (vp->v_type == VDIR) { if (dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); de = dd; } else { error = ENOENT; goto finished; } *buflen = i; de = devfs_parent_dirent(de); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. */ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { VI_LOCK(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if ((vp->v_iflag & VI_DOOMED) != 0) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; KASSERT(vp->v_usecount == 1, ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount)); dev->si_usecount += vp->v_usecount; /* Special casing of ttys for deadfs. Probably redundant. */ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred, NULL); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque1() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { /* Forced close. */ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } if (count_dev(dev) == 1) dflags |= FLASTCLOSE; vholdl(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ sx_slock(&proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { sx_sunlock(&proctree_lock); return (0); } vpold = td->td_proc->p_session->s_ttyvp; VREF(vp); SESS_LOCK(td->td_proc->p_session); td->td_proc->p_session->s_ttyvp = vp; td->td_proc->p_session->s_ttydp = cdev2priv(dev); SESS_UNLOCK(td->td_proc->p_session); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) vrele(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; struct mtx *mtxp; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); mtxp = mtx_pool_find(mtxpool_sleep, fp); /* * Hint to the dofilewrite() to not force the buffer draining * on the writer to the file. Most likely, the write would * not need normal buffers. */ mtx_lock(mtxp); fp->f_vnread_flags |= FDEVFS_VNODE; mtx_unlock(mtxp); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); de = vp->v_data; if (de != NULL) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vnode_destroy_vobject(vp); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); devfs_reclaim(ap); VI_LOCK(vp); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; if (dev != NULL) dev->si_usecount -= vp->v_usecount; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp, 0); if (dvp != vp) VOP_UNLOCK(dvp, 0); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. * */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp,0); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); VI_LOCK(vp2); mtx_unlock(&devfs_de_interlock); if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp, 0); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." */ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, }; /* Vops for VCHR vnodes in /dev. */ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, }; /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); Index: projects/fuse2/sys/fs/ext2fs/ext2_vnops.c =================================================================== --- projects/fuse2/sys/fs/ext2fs/ext2_vnops.c (revision 350434) +++ projects/fuse2/sys/fs/ext2fs/ext2_vnops.c (revision 350435) @@ -1,2346 +1,2347 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.7 (Berkeley) 2/3/94 * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 * $FreeBSD$ */ #include "opt_suiddir.h" #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , vnops, trace, "int", "char*"); static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *); static void ext2_itimes_locked(struct vnode *); static vop_access_t ext2_access; static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ext2_close; static vop_create_t ext2_create; static vop_fsync_t ext2_fsync; static vop_getattr_t ext2_getattr; static vop_ioctl_t ext2_ioctl; static vop_link_t ext2_link; static vop_mkdir_t ext2_mkdir; static vop_mknod_t ext2_mknod; static vop_open_t ext2_open; static vop_pathconf_t ext2_pathconf; static vop_print_t ext2_print; static vop_read_t ext2_read; static vop_readlink_t ext2_readlink; static vop_remove_t ext2_remove; static vop_rename_t ext2_rename; static vop_rmdir_t ext2_rmdir; static vop_setattr_t ext2_setattr; static vop_strategy_t ext2_strategy; static vop_symlink_t ext2_symlink; static vop_write_t ext2_write; static vop_deleteextattr_t ext2_deleteextattr; static vop_getextattr_t ext2_getextattr; static vop_listextattr_t ext2_listextattr; static vop_setextattr_t ext2_setextattr; static vop_vptofh_t ext2_vptofh; static vop_close_t ext2fifo_close; static vop_kqfilter_t ext2fifo_kqfilter; /* Global vfs data structures for ext2. */ struct vop_vector ext2_vnodeops = { .vop_default = &default_vnodeops, .vop_access = ext2_access, .vop_bmap = ext2_bmap, .vop_cachedlookup = ext2_lookup, .vop_close = ext2_close, .vop_create = ext2_create, .vop_fsync = ext2_fsync, .vop_getpages = vnode_pager_local_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_ioctl = ext2_ioctl, .vop_link = ext2_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = ext2_mkdir, .vop_mknod = ext2_mknod, .vop_open = ext2_open, .vop_pathconf = ext2_pathconf, .vop_poll = vop_stdpoll, .vop_print = ext2_print, .vop_read = ext2_read, .vop_readdir = ext2_readdir, .vop_readlink = ext2_readlink, .vop_reallocblks = ext2_reallocblks, .vop_reclaim = ext2_reclaim, .vop_remove = ext2_remove, .vop_rename = ext2_rename, .vop_rmdir = ext2_rmdir, .vop_setattr = ext2_setattr, .vop_strategy = ext2_strategy, .vop_symlink = ext2_symlink, .vop_write = ext2_write, .vop_deleteextattr = ext2_deleteextattr, .vop_getextattr = ext2_getextattr, .vop_listextattr = ext2_listextattr, .vop_setextattr = ext2_setextattr, #ifdef UFS_ACL .vop_getacl = ext2_getacl, .vop_setacl = ext2_setacl, .vop_aclcheck = ext2_aclcheck, #endif /* UFS_ACL */ .vop_vptofh = ext2_vptofh, }; struct vop_vector ext2_fifoops = { .vop_default = &fifo_specops, .vop_access = ext2_access, .vop_close = ext2fifo_close, .vop_fsync = ext2_fsync, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_kqfilter = ext2fifo_kqfilter, .vop_pathconf = ext2_pathconf, .vop_print = ext2_print, .vop_read = VOP_PANIC, .vop_reclaim = ext2_reclaim, .vop_setattr = ext2_setattr, .vop_write = VOP_PANIC, .vop_vptofh = ext2_vptofh, }; /* * A virgin directory (no blushing please). * Note that the type and namlen fields are reversed relative to ext2. * Also, we don't use `struct odirtemplate', since it would just cause * endianness problems. */ static struct dirtemplate mastertemplate = { 0, 12, 1, EXT2_FT_DIR, ".", 0, DIRBLKSIZ - 12, 2, EXT2_FT_DIR, ".." }; static struct dirtemplate omastertemplate = { 0, 12, 1, EXT2_FT_UNKNOWN, ".", 0, DIRBLKSIZ - 12, 2, EXT2_FT_UNKNOWN, ".." }; static void ext2_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { ip->i_atime = ts.tv_sec; ip->i_atimensec = ts.tv_nsec; } if (ip->i_flag & IN_UPDATE) { ip->i_mtime = ts.tv_sec; ip->i_mtimensec = ts.tv_nsec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { ip->i_ctime = ts.tv_sec; ip->i_ctimensec = ts.tv_nsec; } } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ext2_itimes(struct vnode *vp) { VI_LOCK(vp); ext2_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ext2_create(struct vop_create_args *ap) { int error; error = ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } static int ext2_open(struct vop_open_args *ap) { if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. */ static int ext2_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ext2_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) return (EOPNOTSUPP); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((accmode & VWRITE) && (ip->i_flags & (SF_IMMUTABLE | SF_SNAPSHOT))) return (EPERM); error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_accmode, ap->a_cred, NULL); return (error); } static int ext2_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; ext2_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_devvp->v_rdev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_atimensec : 0; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_mtimensec : 0; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_ctimensec : 0; if E2DI_HAS_XTIME(ip) { vap->va_birthtime.tv_sec = ip->i_birthtime; vap->va_birthtime.tv_nsec = ip->i_birthnsec; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ext2_setattr(struct vop_setattr_args *ap) { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { /* Disallow flags not supported by ext2fs. */ if (vap->va_flags & ~(SF_APPEND | SF_IMMUTABLE | UF_NODUMP)) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes and privileged processes in * jail() are not permitted to unset system flags, or * modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; ip->i_flag |= IN_CHANGE; if (ip->i_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, td)))) return (error); ip->i_flag |= IN_CHANGE | IN_MODIFIED; if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; ip->i_atime = vap->va_atime.tv_sec; ip->i_atimensec = vap->va_atime.tv_nsec; } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; ip->i_mtime = vap->va_mtime.tv_sec; ip->i_mtimensec = vap->va_mtime.tv_nsec; } ip->i_birthtime = vap->va_birthtime.tv_sec; ip->i_birthnsec = vap->va_birthtime.tv_nsec; error = ext2_update(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ext2_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ext2_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { error = priv_check_cred(cred, PRIV_VFS_STICKYFILE); if (error) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ext2_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file * to a group of which we are not a member, the caller must * have privilege. */ if (uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN); if (error) return (error); } ogid = ip->i_gid; ouid = ip->i_uid; ip->i_gid = gid; ip->i_uid = uid; ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID) != 0) ip->i_mode &= ~(ISUID | ISGID); } return (0); } /* * Synch an open file. */ /* ARGSUSED */ static int ext2_fsync(struct vop_fsync_args *ap) { /* * Flush all dirty buffers associated with a vnode. */ vop_stdfsync(ap); return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT)); } /* * Mknod vnode call */ /* ARGSUSED */ static int ext2_mknod(struct vop_mknod_args *ap) { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ if (!(ip->i_flag & IN_E4EXTENTS)) ip->i_rdev = vap->va_rdev; } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } static int ext2_remove(struct vop_remove_args *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ext2_dirremove(dvp, ap->a_cnp); if (error == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } /* * link vnode call */ static int ext2_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; int error; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_link: no name"); #endif ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= EXT4_LINK_MAX) { error = EMLINK; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_nlink++; ip->i_flag |= IN_CHANGE; error = ext2_update(vp, !DOINGASYNC(vp)); if (!error) error = ext2_direnter(ip, tdvp, cnp); if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } static int ext2_inc_nlink(struct inode *ip) { ip->i_nlink++; if (S_ISDIR(ip->i_mode) && EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK) && ip->i_nlink > 1) { if (ip->i_nlink >= EXT4_LINK_MAX || ip->i_nlink == 2) ip->i_nlink = 1; } else if (ip->i_nlink > EXT4_LINK_MAX) { ip->i_nlink--; return (EMLINK); } return (0); } static void ext2_dec_nlink(struct inode *ip) { if (!S_ISDIR(ip->i_mode) || ip->i_nlink > 2) ip->i_nlink--; } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ext2_rename(struct vop_rename_args *ap) { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct inode *ip, *xp, *dp; struct dirtemplate *dirbuf; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0; u_char namlen; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ext2_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. Temporarily just warn if they do. */ if (fvp == tvp) { SDT_PROBE2(ext2fs, , vnops, trace, 1, "rename: fvp == tvp (can't happen)"); error = 0; goto abortit; } if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { VOP_UNLOCK(fvp, 0); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp, 0); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp, 0); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory++; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ext2_inc_nlink(ip); ip->i_flag |= IN_CHANGE; if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) { VOP_UNLOCK(fvp, 0); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp, 0); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ext2_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { error = ext2_inc_nlink(dp); if (error) goto bad; dp->i_flag |= IN_CHANGE; error = ext2_update(tdvp, !DOINGASYNC(tdvp)); if (error) goto bad; } error = ext2_direnter(ip, tdvp, tcnp); if (error) { if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; (void)ext2_update(tdvp, 1); } goto bad; } vput(tdvp); } else { if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ext2_rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode & IFMT) == IFDIR) { if (!ext2_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ext2_dirrewrite(dp, ip, tcnp); if (error) goto bad; /* * If the target directory is in the same * directory as the source directory, * decrement the link count on the parent * of the target directory. */ if (doingdirectory && !newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is * a directory it is empty and there are * no links to it, so we can squash the inode and * any space associated with it. We disallowed * renaming over top of a directory with links to * it above, as the remaining link would point to * a directory without "." or ".." entries. */ ext2_dec_nlink(xp); if (doingdirectory) { if (--xp->i_nlink != 0) panic("ext2_rename: linked directory"); error = ext2_truncate(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred, tcnp->cn_thread); } xp->i_flag |= IN_CHANGE; vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. IN_RENAME is not sufficient * to protect against directory races due to timing windows, * so we can't panic here. */ vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { /* * From name resolves to a different inode. IN_RENAME is * not sufficient protection against timing window races * so we can't panic here. */ } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; dirbuf = malloc(dp->i_e2fs->e2fs_bsize, M_TEMP, M_WAITOK | M_ZERO); if (!dirbuf) { error = ENOMEM; goto bad; } error = vn_rdwr(UIO_READ, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); if (error == 0) { /* Like ufs little-endian: */ namlen = dirbuf->dotdot_type; if (namlen != 2 || dirbuf->dotdot_name[0] != '.' || dirbuf->dotdot_name[1] != '.') { ext2_dirbad(xp, (doff_t)12, "rename: mangled dir"); } else { dirbuf->dotdot_ino = newparent; /* * dirblock 0 could be htree root, * try both csum update functions. */ ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); ext2_dx_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); (void)vn_rdwr(UIO_WRITE, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); cache_purge(fdvp); } } free(dirbuf, M_TEMP); } error = ext2_dirremove(fdvp, fcnp); if (!error) { ext2_dec_nlink(xp); xp->i_flag |= IN_CHANGE; } xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { ext2_dec_nlink(ip); ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } #ifdef UFS_ACL static int ext2_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; *dacl = *acl; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = dmode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ #ifdef DEBUG printf("ext2_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); #endif /* DEBUG */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ext2_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } #endif /* UFS_ACL */ /* * Mkdir system call */ static int ext2_mkdir(struct vop_mkdir_args *ap) { struct m_ext2fs *fs; struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct dirtemplate dirtemplate, *dtp; char *buf = NULL; int error, dmode; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ext2_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); fs = ip->i_e2fs; ip->i_gid = dp->i_gid; #ifdef SUIDDIR { /* * if we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; error = ext2_update(tvp, 1); /* * Bump link count in parent directory * to reflect work done below. Should * be done before reference is created * so reparation is possible if we crash. */ ext2_inc_nlink(dp); dp->i_flag |= IN_CHANGE; error = ext2_update(dvp, !DOINGASYNC(dvp)); if (error) goto bad; /* Initialize directory with "." and ".." from static template. */ if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs, EXT2F_INCOMPAT_FTYPE)) dtp = &mastertemplate; else dtp = &omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; /* * note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE so let's * just redefine it - for this function only */ #undef DIRBLKSIZ #define DIRBLKSIZ VTOI(dvp)->i_e2fs->e2fs_bsize dirtemplate.dotdot_reclen = DIRBLKSIZ - 12; buf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK | M_ZERO); if (!buf) { error = ENOMEM; ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; goto bad; } if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { dirtemplate.dotdot_reclen -= sizeof(struct ext2fs_direct_tail); ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, DIRBLKSIZ)); } memcpy(buf, &dirtemplate, sizeof(dirtemplate)); ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)buf); error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)buf, DIRBLKSIZ, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED, NULL, NULL); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; goto bad; } if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) /* XXX should grow with balloc() */ panic("ext2_mkdir: blksize"); else { ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE; } #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* UFS_ACL */ /* Directory set up, now install its entry in the parent directory. */ error = ext2_direnter(ip, dvp, cnp); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } bad: /* * No need to do an explicit VOP_TRUNCATE here, vrele will do this * for us because we set the link count to 0. */ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } else *ap->a_vpp = tvp; out: free(buf, M_TEMP); return (error); #undef DIRBLKSIZ #define DIRBLKSIZ DEV_BSIZE } /* * Rmdir system call. */ static int ext2_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ if (!ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ error = ext2_dirremove(dvp, cnp); if (error) goto out; ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; cache_purge(dvp); VOP_UNLOCK(dvp, 0); /* * Truncate inode. The only stuff left * in the directory is "." and "..". */ ip->i_nlink = 0; error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred, cnp->cn_thread); cache_purge(ITOV(ip)); if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } out: return (error); } /* * symlink -- make a symbolic link */ static int ext2_symlink(struct vop_symlink_args *ap) { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target), len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Return target name of a symbolic link */ static int ext2_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if (isize < vp->v_mount->mnt_maxsymlinklen) { uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ext2_bmaparray() operation may not * deadlock on memory. See ext2_bmap() for details. */ static int ext2_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct bufobj *bo; daddr_t blkno; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ext2_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS) error = ext4_bmapext(vp, bp->b_lblkno, &blkno, NULL, NULL); else error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = VFSTOEXT2(vp->v_mount)->um_bo; BO_STRATEGY(bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ext2_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); vn_printf(ip->i_devvp, "\tino %ju", (uintmax_t)ip->i_number); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ext2fifo_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ext2 kqfilter routines if needed */ static int ext2fifo_kqfilter(struct vop_kqfilter_args *ap) { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = vfs_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to ext2 filesystems. */ static int ext2_pathconf(struct vop_pathconf_args *ap) { int error = 0; switch (ap->a_name) { case _PC_LINK_MAX: if (EXT2_HAS_RO_COMPAT_FEATURE(VTOI(ap->a_vp)->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) *ap->a_retval = INT_MAX; else *ap->a_retval = EXT4_LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_PATH_MAX: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; break; #endif /* UFS_ACL */ case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } /* * Vnode operation to remove a named attribute. */ static int ext2_deleteextattr(struct vop_deleteextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_delete(ip, ap->a_attrnamespace, ap->a_name); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_delete(ip, ap->a_attrnamespace, ap->a_name); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ext2_getextattr(struct vop_getextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ext2_listextattr(struct vop_listextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); if (error) return (error); } if (ip->i_facl) error = ext2_extattr_block_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to set a named attribute. */ static int ext2_setextattr(struct vop_setextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ext2_extattr_valid_attrname(ap->a_attrnamespace, ap->a_name); if (error) return (error); if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); if (error != ENOSPC) return (error); } error = ext2_extattr_block_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); return (error); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(struct vop_vptofh_args *ap) { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ext2_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp) { struct inode *ip; struct vnode *vp; vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); /* * Only unallocated inodes should be of type VNON. */ if (ip->i_mode != 0 && vp->v_type == VNON) return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == EXT2_ROOTINO) vp->v_vflag |= VV_ROOT; ip->i_modrev = init_va_filerev(); *vpp = vp; return (0); } /* * Allocate a new inode. */ static int ext2_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct inode *ip, *pdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp); if (error) { return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; #ifdef SUIDDIR { /* * if we are * not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; mode &= ~07111; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_nlink = 1; if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) { if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~ISGID; } if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Make sure inode goes to disk before directory entry. */ error = ext2_update(tvp, !DOINGASYNC(tvp)); if (error) goto bad; #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* UFS_ACL */ error = ext2_direnter(ip, dvp, cnp); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } /* * Vnode op for reading. */ static int ext2_read(struct vop_read_args *ap) { struct vnode *vp; struct inode *ip; struct uio *uio; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid, seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("%s: mode", "ext2_read"); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("%s: short symlink", "ext2_read"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", "ext2_read", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0")); fs = ip->i_e2fs; if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->e2fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { u_int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error because the loop * above resets bp to NULL on each iteration and on normal * completion has not set a new value into it. so it must have come * from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) ip->i_flag |= IN_ACCESS; return (error); } static int ext2_ioctl(struct vop_ioctl_args *ap) { switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: return (vn_bmap_seekhole(ap->a_vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } /* * Vnode op for writing. */ static int ext2_write(struct vop_write_args *ap) { struct vnode *vp; struct uio *uio; struct inode *ip; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; seqcount = ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("%s: mode", "ext2_write"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: /* XXX differs from ffs -- this is called from ext2_mkdir(). */ if ((ioflag & IO_SYNC) == 0) panic("ext2_write: nonsync dir write"); break; default: panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp, vp->v_type, (intmax_t)uio->uio_offset, (intmax_t)uio->uio_resid); } KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0")); fs = ip->i_e2fs; if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->e2fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = ext2_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); if (error != 0) break; if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) ip->i_size = uio->uio_offset + xfersize; size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to ext2_balloc() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. */ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize == xfersize) vfs_bio_clrbuf(bp); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->e2fs_fsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount, 0); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~(ISUID | ISGID); } if (error) { if (ioflag & IO_UNIT) { (void)ext2_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } if (uio->uio_resid != resid) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (ioflag & IO_SYNC) error = ext2_update(vp, 1); } return (error); } Index: projects/fuse2/sys/fs/fuse/fuse_vnops.c =================================================================== --- projects/fuse2/sys/fs/fuse/fuse_vnops.c (revision 350434) +++ projects/fuse2/sys/fs/fuse/fuse_vnops.c (revision 350435) @@ -1,2470 +1,2471 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_io.h" #include /* Maximum number of hardlinks to a single FUSE file */ #define FUSE_LINK_MAX UINT32_MAX SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_advlock_t fuse_vnop_advlock; static vop_bmap_t fuse_vnop_bmap; static vop_close_t fuse_fifo_close; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_deleteextattr_t fuse_vnop_deleteextattr; static vop_fdatasync_t fuse_vnop_fdatasync; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; static vop_inactive_t fuse_vnop_inactive; static vop_link_t fuse_vnop_link; static vop_listextattr_t fuse_vnop_listextattr; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_pathconf_t fuse_vnop_pathconf; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_setextattr_t fuse_vnop_setextattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_print_t fuse_vnop_print; static vop_vptofh_t fuse_vnop_vptofh; struct vop_vector fuse_fifoops = { .vop_default = &fifo_specops, .vop_access = fuse_vnop_access, .vop_close = fuse_fifo_close, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_inactive = fuse_vnop_inactive, .vop_pathconf = fuse_vnop_pathconf, .vop_print = fuse_vnop_print, .vop_read = VOP_PANIC, .vop_reclaim = fuse_vnop_reclaim, .vop_setattr = fuse_vnop_setattr, .vop_write = VOP_PANIC, .vop_vptofh = fuse_vnop_vptofh, }; struct vop_vector fuse_vnops = { .vop_allocate = VOP_EINVAL, .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_advlock = fuse_vnop_advlock, .vop_bmap = fuse_vnop_bmap, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, .vop_fdatasync = fuse_vnop_fdatasync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, /* * TODO: implement vop_ioctl after upgrading to protocol 7.16. * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until * 7.16. */ .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, /* * TODO: implement vop_poll after upgrading to protocol 7.21. * FUSE_POLL was added in protocol 7.11, but it's kind of broken until * 7.21, which adds the ability for the client to choose which poll * events it wants, and for a client to deregister a file handle */ .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_setextattr = fuse_vnop_setextattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_print = fuse_vnop_print, .vop_vptofh = fuse_vnop_vptofh, }; uma_zone_t fuse_pbuf_zone; #define fuse_vm_page_lock(m) vm_page_lock((m)); #define fuse_vm_page_unlock(m) vm_page_unlock((m)); #define fuse_vm_page_lock_queues() ((void)0) #define fuse_vm_page_unlock_queues() ((void)0) /* Check permission for extattr operations, much like extattr_check_cred */ static int fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred, struct thread *td, accmode_t accmode) { struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (ns) { case EXTATTR_NAMESPACE_SYSTEM: if (data->dataflags & FSESS_DEFAULT_PERMISSIONS) { return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); } /* FALLTHROUGH */ case EXTATTR_NAMESPACE_USER: return (fuse_internal_access(vp, accmode, td, cred)); default: return (EPERM); } } /* Get a filehandle for a directory */ static int fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp, struct ucred *cred, pid_t pid) { if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0) return 0; return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid); } /* Send FUSE_FLUSH for this vnode */ static int fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag) { struct fuse_flush_in *ffi; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct thread *td = curthread; struct mount *mp = vnode_mount(vp); int err; if (!fsess_isimpl(vnode_mount(vp), FUSE_FLUSH)) return 0; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (err) return err; fdisp_init(&fdi, sizeof(*ffi)); fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred); ffi = fdi.indata; ffi->fh = fufh->fh_id; /* * If the file has a POSIX lock then we're supposed to set lock_owner. * If not, then lock_owner is undefined. So we may as well always set * it. */ ffi->lock_owner = td->td_proc->p_pid; err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_FLUSH); err = 0; } fdisp_destroy(&fdi); return err; } /* Close wrapper for fifos. */ static int fuse_fifo_close(struct vop_close_args *ap) { return (fifo_specops.vop_close(ap)); } /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred); return err; } /* * struct vop_advlock_args { * struct vop_generic_args a_gen; * struct vnode *a_vp; * void *a_id; * int a_op; * struct flock *a_fl; * int a_flags; * } */ static int fuse_vnop_advlock(struct vop_advlock_args *ap) { struct vnode *vp = ap->a_vp; struct flock *fl = ap->a_fl; struct thread *td = curthread; struct ucred *cred = td->td_ucred; pid_t pid = td->td_proc->p_pid; struct fuse_filehandle *fufh; struct fuse_dispatcher fdi; struct fuse_lk_in *fli; struct fuse_lk_out *flo; enum fuse_opcode op; int dataflags, err; int flags = ap->a_flags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; if (fuse_isdeadfs(vp)) { return ENXIO; } if (!(dataflags & FSESS_POSIX_LOCKS)) return vop_stdadvlock(ap); /* FUSE doesn't properly support flock until protocol 7.17 */ if (flags & F_FLOCK) return vop_stdadvlock(ap); err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid); if (err) return err; fdisp_init(&fdi, sizeof(*fli)); switch(ap->a_op) { case F_GETLK: op = FUSE_GETLK; break; case F_SETLK: op = FUSE_SETLK; break; case F_SETLKW: op = FUSE_SETLKW; break; default: return EINVAL; } fdisp_make_vp(&fdi, op, vp, td, cred); fli = fdi.indata; fli->fh = fufh->fh_id; fli->owner = fl->l_pid; fli->lk.start = fl->l_start; if (fl->l_len != 0) fli->lk.end = fl->l_start + fl->l_len - 1; else fli->lk.end = INT64_MAX; fli->lk.type = fl->l_type; fli->lk.pid = fl->l_pid; err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); if (err == 0 && op == FUSE_GETLK) { flo = fdi.answ; fl->l_type = flo->lk.type; fl->l_pid = flo->lk.pid; if (flo->lk.type != F_UNLCK) { fl->l_start = flo->lk.start; if (flo->lk.end == INT64_MAX) fl->l_len = 0; else fl->l_len = flo->lk.end - flo->lk.start + 1; fl->l_start = flo->lk.start; } } return err; } /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ static int fuse_vnop_bmap(struct vop_bmap_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj **bo = ap->a_bop; struct thread *td = curthread; struct mount *mp; struct fuse_dispatcher fdi; struct fuse_bmap_in *fbi; struct fuse_bmap_out *fbo; struct fuse_data *data; uint64_t biosize; off_t filesize; daddr_t lbn = ap->a_bn; daddr_t *pbn = ap->a_bnp; int *runp = ap->a_runp; int *runb = ap->a_runb; int error = 0; int maxrun; if (fuse_isdeadfs(vp)) { return ENXIO; } mp = vnode_mount(vp); data = fuse_get_mpdata(mp); biosize = fuse_iosize(vp); maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1, data->max_readahead_blocks); if (bo != NULL) *bo = &vp->v_bufobj; /* * The FUSE_BMAP operation does not include the runp and runb * variables, so we must guess. Report nonzero contiguous runs so * cluster_read will combine adjacent reads. It's worthwhile to reduce * upcalls even if we don't know the true physical layout of the file. * * FUSE file systems may opt out of read clustering in two ways: * * mounting with -onoclusterr * * Setting max_readahead <= maxbcachebuf during FUSE_INIT */ if (runb != NULL) *runb = MIN(lbn, maxrun); if (runp != NULL) { error = fuse_vnode_size(vp, &filesize, td->td_ucred, td); if (error == 0) *runp = MIN(MAX(0, filesize / biosize - lbn - 1), maxrun); else *runp = 0; } if (fsess_isimpl(mp, FUSE_BMAP)) { fdisp_init(&fdi, sizeof(*fbi)); fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred); fbi = fdi.indata; fbi->block = lbn; fbi->blocksize = biosize; error = fdisp_wait_answ(&fdi); if (error == ENOSYS) { fdisp_destroy(&fdi); fsess_set_notimpl(mp, FUSE_BMAP); error = 0; } else { fbo = fdi.answ; if (error == 0 && pbn != NULL) *pbn = fbo->block; fdisp_destroy(&fdi); return error; } } /* If the daemon doesn't support BMAP, make up a sensible default */ if (pbn != NULL) *pbn = lbn * btodb(biosize); return (error); } /* struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; struct thread *td = ap->a_td; pid_t pid = td->td_proc->p_pid; int err = 0; if (fuse_isdeadfs(vp)) return 0; if (vnode_isdir(vp)) return 0; if (fflag & IO_NDELAY) return 0; err = fuse_flush(vp, cred, pid, fflag); /* TODO: close the file handle, if we're sure it's no longer used */ if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred, td->td_proc->p_pid); } return err; } static void fdisp_make_mknod_for_fallback( struct fuse_dispatcher *fdip, struct componentname *cnp, struct vnode *dvp, uint64_t parentnid, struct thread *td, struct ucred *cred, mode_t mode, enum fuse_opcode *op) { struct fuse_mknod_in *fmni; fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1); *op = FUSE_MKNOD; fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred); fmni = fdip->indata; fmni->mode = mode; fmni->rdev = 0; memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0'; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; struct fuse_data *data; struct fuse_create_in *fci; struct fuse_entry_out *feo; struct fuse_open_out *foo; struct fuse_dispatcher fdi, fdi2; struct fuse_dispatcher *fdip = &fdi; struct fuse_dispatcher *fdip2 = NULL; int err; struct mount *mp = vnode_mount(dvp); data = fuse_get_mpdata(mp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); enum fuse_opcode op; int flags; if (fuse_isdeadfs(dvp)) return ENXIO; /* FUSE expects sockets to be created with FUSE_MKNOD */ if (vap->va_type == VSOCK) return fuse_internal_mknod(dvp, vpp, cnp, vap); /* * VOP_CREATE doesn't tell us the open(2) flags, so we guess. Only a * writable mode makes sense, and we might as well include readability * too. */ flags = O_RDWR; bzero(&fdi, sizeof(fdi)); if (vap->va_type != VREG) return (EINVAL); if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) { /* Fallback to FUSE_MKNOD/FUSE_OPEN */ fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); } else { /* Use FUSE_CREATE */ size_t insize; op = FUSE_CREATE; fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1); fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred); fci = fdip->indata; fci->mode = mode; fci->flags = O_CREAT | flags; if (fuse_libabi_geq(data, 7, 12)) { insize = sizeof(*fci); fci->umask = td->td_proc->p_fd->fd_cmask; } else { insize = sizeof(struct fuse_open_in); } memcpy((char *)fdip->indata + insize, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0'; } err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS && op == FUSE_CREATE) { fsess_set_notimpl(mp, FUSE_CREATE); fdisp_destroy(fdip); fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td, cred, mode, &op); err = fdisp_wait_answ(fdip); } if (err) goto out; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, vap->va_type))) { goto out; } if (op == FUSE_CREATE) { foo = (struct fuse_open_out*)(feo + 1); } else { /* Issue a separate FUSE_OPEN */ struct fuse_open_in *foi; fdip2 = &fdi2; fdisp_init(fdip2, sizeof(*foi)); fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td, cred); foi = fdip2->indata; foi->flags = flags; err = fdisp_wait_answ(fdip2); if (err) goto out; foo = fdip2->answ; } err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = foo->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = flags; fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick, false); goto out; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo); fuse_vnode_open(*vpp, foo->open_flags, td); /* * Purge the parent's attribute cache because the daemon should've * updated its mtime and ctime */ fuse_vnode_clear_attr_cache(dvp); cache_purge_negative(dvp); out: if (fdip2) fdisp_destroy(fdip2); fdisp_destroy(fdip); return err; } /* struct vnop_fdatasync_args { struct vop_generic_args a_gen; struct vnode * a_vp; struct thread * a_td; }; */ static int fuse_vnop_fdatasync(struct vop_fdatasync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = MNT_WAIT; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfdatasync_buf(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, true); } /* struct vnop_fsync_args { struct vop_generic_args a_gen; struct vnode * a_vp; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; int waitfor = ap->a_waitfor; int err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; return fuse_internal_fsync(vp, td, waitfor, false); } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; int err = 0; int dataflags; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. */ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; return err; } else { goto fake; } } err = fuse_internal_getattr(vp, vap, cred, td); if (err == ENOTCONN && vnode_isvroot(vp)) { /* see comment in fuse_vfsop_statfs() */ goto fake; } else { return err; } fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh, *fufh_tmp; int need_flush = 1; LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { if (need_flush && vp->v_type == VREG) { if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, NULL, 0); } if ((fvdat->flag & FN_REVOKED) != 0) fuse_io_invalbuf(vp, td); else fuse_io_flushbuf(vp, MNT_WAIT, td); need_flush = 0; } fuse_filehandle_close(vp, fufh, td, NULL); } if ((fvdat->flag & FN_REVOKED) != 0) vrecycle(vp); return 0; } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } /* * This is a seatbelt check to protect naive userspace filesystems from * themselves and the limitations of the FUSE IPC protocol. If a * filesystem does not allow attribute caching, assume it is capable of * validating that nlink does not overflow. */ if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX) return EMLINK; fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); if (!err) { /* * Purge the parent's attribute cache because the daemon * should've updated its mtime and ctime */ fuse_vnode_clear_attr_cache(tdvp); fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); } out: fdisp_destroy(&fdi); return err; } struct fuse_lookup_alloc_arg { struct fuse_entry_out *feo; struct componentname *cnp; uint64_t nid; enum vtype vtyp; }; /* Callback for vn_get_ino */ static int fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { struct fuse_lookup_alloc_arg *flaa = arg; return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp, flaa->vtyp); } SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup, "int", "struct timespec*", "struct timespec*"); /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; int wantparent = flags & (LOCKPARENT | WANTPARENT); int islastcn = flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; bool did_lookup = false; struct fuse_entry_out *feo = NULL; enum vtype vtyp; /* vnode type of target */ off_t filesize; /* filesize of target */ uint64_t nid; if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) return ENOTDIR; if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) return EROFS; if ((err = fuse_internal_access(dvp, VEXEC, td, cred))) return err; if (flags & ISDOTDOT) { KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID, ("Looking up .. is TODO")); nid = VTOFUD(dvp)->parent_nid; if (nid == 0) return ENOENT; /* .. is obviously a directory */ vtyp = VDIR; filesize = 0; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); /* . is obviously a directory */ vtyp = VDIR; filesize = 0; } else { struct timespec now, timeout; err = cache_lookup(dvp, vpp, cnp, &timeout, NULL); getnanouptime(&now); SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now); switch (err) { case -1: /* positive match */ if (timespeccmp(&timeout, &now, >)) { counter_u64_add(fuse_lookup_cache_hits, 1); } else { /* Cache timeout */ counter_u64_add(fuse_lookup_cache_misses, 1); bintime_clear( &VTOFUD(*vpp)->entry_cache_timeout); cache_purge(*vpp); if (dvp != *vpp) vput(*vpp); else vrele(*vpp); *vpp = NULL; break; } return 0; case 0: /* no match in cache */ counter_u64_add(fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ getnanouptime(&now); if (timespeccmp(&timeout, &now, <=)) { /* Cache timeout */ cache_purge_negative(dvp); break; } /* fall through */ default: return err; } nid = VTOI(dvp); fdisp_init(&fdi, cnp->cn_namelen + 1); fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; lookup_err = fdisp_wait_answ(&fdi); did_lookup = true; if (!lookup_err) { /* lookup call succeeded */ feo = (struct fuse_entry_out *)fdi.answ; nid = feo->nodeid; if (nid == 0) { /* zero nodeid means ENOENT and cache it */ struct timespec timeout; fdi.answ_stat = ENOENT; lookup_err = ENOENT; if (cnp->cn_flags & MAKEENTRY) { fuse_validity_2_timespec(feo, &timeout); cache_enter_time(dvp, *vpp, cnp, &timeout, NULL); } } else if (nid == FUSE_ROOT_ID) { lookup_err = EINVAL; } vtyp = IFTOVT(feo->attr.mode); filesize = feo->attr.size; } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) { fdisp_destroy(&fdi); return lookup_err; } } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { /* Entry not found */ if ((nameiop == CREATE || nameiop == RENAME) && islastcn) { err = fuse_internal_access(dvp, VWRITE, td, cred); if (!err) { /* * Set the SAVENAME flag to hold onto the * pathname for use later in VOP_CREATE or * VOP_RENAME. */ cnp->cn_flags |= SAVENAME; err = EJUSTRETURN; } } else { err = ENOENT; } } else { /* Entry was found */ if (flags & ISDOTDOT) { struct fuse_lookup_alloc_arg flaa; flaa.nid = nid; flaa.feo = feo; flaa.cnp = cnp; flaa.vtyp = vtyp; err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0, &vp); *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { struct fuse_vnode_data *fvdat; err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, &vp, cnp, vtyp); if (err) goto out; *vpp = vp; /* * In the case where we are looking up a FUSE node * represented by an existing cached vnode, and the * true size reported by FUSE_LOOKUP doesn't match * the vnode's cached size, then any cached writes * beyond the file's current size are lost. * * We can get here: * * following attribute cache expiration, or * * due a bug in the daemon, or */ fvdat = VTOFUD(vp); if (vnode_isreg(vp) && filesize != fvdat->cached_attrs.va_size && fvdat->flag & FN_SIZECHANGE) { /* * The FN_SIZECHANGE flag reflects a dirty * append. If userspace lets us know our cache * is invalid, that write was lost. (Dirty * writes that do not cause append are also * lost, but we don't detect them here.) * * XXX: Maybe disable WB caching on this mount. */ printf("%s: WB cache incoherent on %s!\n", __func__, vnode_mount(vp)->mnt_stat.f_mntonname); fvdat->flag &= ~FN_SIZECHANGE; } MPASS(feo != NULL); fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec, &fvdat->entry_cache_timeout); if ((nameiop == DELETE || nameiop == RENAME) && islastcn) { struct vattr dvattr; err = fuse_internal_access(dvp, VWRITE, td, cred); if (err != 0) goto out; /* * if the parent's sticky bit is set, check * whether we're allowed to remove the file. * Need to figure out the vnode locking to make * this work. */ fuse_internal_getattr(dvp, &dvattr, cred, td); if ((dvattr.va_mode & S_ISTXT) && fuse_internal_access(dvp, VADMIN, td, cred) && fuse_internal_access(*vpp, VADMIN, td, cred)) { err = EPERM; goto out; } } if (islastcn && ( (nameiop == DELETE) || (nameiop == RENAME && wantparent))) { cnp->cn_flags |= SAVENAME; } } } out: if (err) { if (vp != NULL && dvp != vp) vput(vp); else if (vp != NULL) vrele(vp); *vpp = NULL; } if (did_lookup) fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); fmdi.umask = curthread->td_proc->p_fd->fd_cmask; return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; if (fuse_isdeadfs(dvp)) return ENXIO; return fuse_internal_mknod(dvp, vpp, cnp, vap); } /* struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int a_mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; pid_t pid = td->td_proc->p_pid; struct fuse_vnode_data *fvdat; if (fuse_isdeadfs(vp)) return ENXIO; if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO) return (EOPNOTSUPP); if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0) return EINVAL; fvdat = VTOFUD(vp); if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) { fuse_vnode_open(vp, 0, td); return 0; } return fuse_filehandle_open(vp, a_mode, NULL, td, cred); } static int fuse_vnop_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX); return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } } /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred, pid); } /* struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; */ static int fuse_vnop_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh = NULL; struct fuse_iov cookediov; int err = 0; u_long *cookies; off_t startoff; ssize_t tresid; int ncookies; bool closefufh = false; pid_t pid = curthread->td_proc->p_pid; if (ap->a_eofflag) *ap->a_eofflag = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ (uio_resid(uio) < sizeof(struct dirent))) { return EINVAL; } tresid = uio->uio_resid; startoff = uio->uio_offset; err = fuse_filehandle_get_dir(vp, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do VOP_READDIR without first doing VOP_OPEN. We * must implicitly open the directory here */ err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred); if (err == 0) { /* * When a directory is opened, it must be read from * the beginning. Hopefully, the "startoff" still * exists as an offset cookie for the directory. * If not, it will read the entire directory without * returning any entries and just return eof. */ uio->uio_offset = 0; } closefufh = true; } if (err) return (err); if (ap->a_ncookies != NULL) { ncookies = uio->uio_resid / (offsetof(struct dirent, d_name) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov, &ncookies, cookies); fiov_teardown(&cookediov); if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); if (ap->a_ncookies != NULL) { if (err == 0) { *ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (err == 0 && tresid == uio->uio_resid) *ap->a_eofflag = 1; return err; } /* struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; */ static int fuse_vnop_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_dispatcher fdi; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (!vnode_islnk(vp)) { return EINVAL; } fdisp_init(&fdi, 0); err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred); if (err) { goto out; } if (((char *)fdi.answ)[0] == '/' && fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) { char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname; err = uiomove(mpth, strlen(mpth), uio); } if (!err) { err = uiomove(fdi.answ, fdi.iosize, uio); } out: fdisp_destroy(&fdi); return err; } /* struct vnop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh, *fufh_tmp; if (!fvdat) { panic("FUSE: no vnode data during recycling"); } LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) { printf("FUSE: vnode being reclaimed with open fufh " "(type=%#x)", fufh->fufh_type); fuse_filehandle_close(vp, fufh, td, NULL); } if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) { fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp), fvdat->nlookup); } fuse_vnode_setparent(vp, NULL); cache_purge(vp); vfs_hash_remove(vp); vnode_destroy_vobject(vp); fuse_vnode_destroy(vp); return 0; } /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_isdir(vp)) { return EPERM; } err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); return err; } /* struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; */ static int fuse_vnop_rename(struct vop_rename_args *ap) { struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; bool newparent = fdvp != tdvp; bool isdir = fvp->v_type == VDIR; int err = 0; if (fuse_isdeadfs(fdvp)) { return ENXIO; } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename"); err = EXDEV; goto out; } cache_purge(fvp); /* * FUSE library is expected to check if target directory is not * under the source directory in the file system tree. * Linux performs this check at VFS level. */ /* * If source is a directory, and it will get a new parent, user must * have write permission to it, so ".." can be modified. */ data = fuse_get_mpdata(vnode_mount(tdvp)); if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) { err = fuse_internal_access(fvp, VWRITE, tcnp->cn_thread, tcnp->cn_cred); if (err) goto out; } sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { if (tdvp != fdvp) fuse_vnode_setparent(fvp, tdvp); if (tvp != NULL) fuse_vnode_setparent(tvp, NULL); } sx_unlock(&data->rename_lock); if (tvp != NULL && tvp != fvp) { cache_purge(tvp); } if (vnode_isdir(fvp)) { if ((tvp != NULL) && vnode_isdir(tvp)) { cache_purge(tdvp); } cache_purge(fdvp); } out: if (tdvp == tvp) { vrele(tdvp); } else { vput(tdvp); } if (tvp != NULL) { vput(tvp); } vrele(fdvp); vrele(fvp); return err; } /* struct vnop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } *ap; */ static int fuse_vnop_rmdir(struct vop_rmdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp) == VTOFUD(dvp)) { return EINVAL; } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); return err; } /* struct vnop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct mount *mp; struct fuse_data *data; struct vattr old_va; int dataflags; int err = 0, err2; accmode_t accmode = 0; bool checkperm; bool drop_suid = false; gid_t cr_gid; mp = vnode_mount(vp); data = fuse_get_mpdata(mp); dataflags = data->dataflags; checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS; if (cred->cr_ngroups > 0) cr_gid = cred->cr_groups[0]; else cr_gid = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vap->va_uid != (uid_t)VNOVAL) { if (checkperm) { /* Only root may change a file's owner */ err = priv_check_cred(cred, PRIV_VFS_CHOWN); if (err) { /* As a special case, allow the null chown */ err2 = fuse_internal_getattr(vp, &old_va, cred, td); if (err2) return (err2); if (vap->va_uid != old_va.va_uid) return err; else accmode |= VADMIN; drop_suid = true; } else accmode |= VADMIN; } else accmode |= VADMIN; } if (vap->va_gid != (gid_t)VNOVAL) { if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN)) drop_suid = true; if (checkperm && !groupmember(vap->va_gid, cred)) { /* * Non-root users may only chgrp to one of their own * groups */ err = priv_check_cred(cred, PRIV_VFS_CHOWN); if (err) { /* As a special case, allow the null chgrp */ err2 = fuse_internal_getattr(vp, &old_va, cred, td); if (err2) return (err2); if (vap->va_gid != old_va.va_gid) return err; accmode |= VADMIN; } else accmode |= VADMIN; } else accmode |= VADMIN; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vfs_isrdonly(mp)) return (EROFS); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } /* Don't set accmode. Permission to trunc is checked upstack */ } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vap->va_vaflags & VA_UTIMES_NULL) accmode |= VWRITE; else accmode |= VADMIN; } if (drop_suid) { if (vap->va_mode != (mode_t)VNOVAL) vap->va_mode &= ~(S_ISUID | S_ISGID); else { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID); } } if (vap->va_mode != (mode_t)VNOVAL) { /* Only root may set the sticky bit on non-directories */ if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT) && priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return EFTYPE; if (checkperm && (vap->va_mode & S_ISGID)) { err = fuse_internal_getattr(vp, &old_va, cred, td); if (err) return (err); if (!groupmember(old_va.va_gid, cred)) { err = priv_check_cred(cred, PRIV_VFS_SETGID); if (err) return (err); } } accmode |= VADMIN; } if (vfs_isrdonly(mp)) return EROFS; err = fuse_internal_access(vp, accmode, td, cred); if (err) return err; else return fuse_internal_setattr(vp, vap, td, cred); } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return 0; } /* * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags. * fuse_io_strategy sets bp's error fields */ (void)fuse_io_strategy(vp, bp); return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; const char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. */ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; pid_t pid = curthread->td_proc->p_pid; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred, pid); } static daddr_t fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { const int biosize = fuse_iosize(vp); return (off / biosize); } static int fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn) { off_t filesize; int blksz, err; const int biosize = fuse_iosize(vp); err = fuse_vnode_size(vp, &filesize, NULL, NULL); KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here")); if (err) return biosize; if ((off_t)lbn * biosize >= filesize) { blksz = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { blksz = filesize - (off_t)lbn *biosize; } else { blksz = biosize; } return (blksz); } /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { struct vnode *vp = ap->a_vp; if (!fsess_opt_mmap(vnode_mount(vp))) { SDT_PROBE2(fusefs, , vnops, trace, 1, "called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz)); } static const char extattr_namespace_separator = '.'; /* struct vop_getextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getextattr(struct vop_getextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_getxattr_in *get_xattr_in; struct fuse_getxattr_out *get_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *attr_str; size_t len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (!fsess_isimpl(mp, FUSE_GETXATTR)) return EOPNOTSUPP; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*get_xattr_in)); fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred); get_xattr_in = fdi.indata; /* * Check to see whether we're querying the available size or * issuing the actual request. If we pass in 0, we get back struct * fuse_getxattr_out. If we pass in a non-zero size, we get back * that much data, without the struct fuse_getxattr_out header. */ if (uio == NULL) get_xattr_in->size = 0; else get_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*get_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_GETXATTR); err = EOPNOTSUPP; } goto out; } get_xattr_out = fdi.answ; if (ap->a_size != NULL) *ap->a_size = get_xattr_out->size; if (uio != NULL) err = uiomove(fdi.answ, fdi.iosize, uio); out: fdisp_destroy(&fdi); return (err); } /* struct vop_setextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setextattr(struct vop_setextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_setxattr_in *set_xattr_in; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (!fsess_isimpl(mp, FUSE_SETXATTR)) return EOPNOTSUPP; if (vfs_isrdonly(mp)) return EROFS; /* Deleting xattrs must use VOP_DELETEEXTATTR instead */ if (ap->a_uio == NULL) { /* * If we got here as fallback from VOP_DELETEEXTATTR, then * return EOPNOTSUPP. */ if (!fsess_isimpl(mp, FUSE_REMOVEXATTR)) return (EOPNOTSUPP); else return (EINVAL); } err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VWRITE); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid); fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred); set_xattr_in = fdi.indata; set_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*set_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len, uio->uio_resid, uio); if (err != 0) { goto out; } err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_SETXATTR); err = EOPNOTSUPP; } if (err == ERESTART) { /* Can't restart after calling uiomove */ err = EINTR; } out: fdisp_destroy(&fdi); return (err); } /* * The Linux / FUSE extended attribute list is simply a collection of * NUL-terminated strings. The FreeBSD extended attribute list is a single * byte length followed by a non-NUL terminated string. So, this allows * conversion of the Linux / FUSE format to the FreeBSD format in place. * Linux attribute names are reported with the namespace as a prefix (e.g. * "user.attribute_name"), but in FreeBSD they are reported without the * namespace prefix (e.g. "attribute_name"). So, we're going from: * * user.attr_name1\0user.attr_name2\0 * * to: * * attr_name1attr_name2 * * Where "" is a single byte number of characters in the attribute name. * * Args: * prefix - exattr namespace prefix string * list, list_len - input list with namespace prefixes * bsd_list, bsd_list_len - output list compatible with bsd vfs */ static int fuse_xattrlist_convert(char *prefix, const char *list, int list_len, char *bsd_list, int *bsd_list_len) { int len, pos, dist_to_next, prefix_len; pos = 0; *bsd_list_len = 0; prefix_len = strlen(prefix); while (pos < list_len && list[pos] != '\0') { dist_to_next = strlen(&list[pos]) + 1; if (bcmp(&list[pos], prefix, prefix_len) == 0 && list[pos + prefix_len] == extattr_namespace_separator) { len = dist_to_next - (prefix_len + sizeof(extattr_namespace_separator)) - 1; if (len >= EXTATTR_MAXNAMELEN) return (ENAMETOOLONG); bsd_list[*bsd_list_len] = len; memcpy(&bsd_list[*bsd_list_len + 1], &list[pos + prefix_len + sizeof(extattr_namespace_separator)], len); *bsd_list_len += len + 1; } pos += dist_to_next; } return (0); } /* struct vop_listextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_listextattr(struct vop_listextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_listxattr_in *list_xattr_in; struct fuse_listxattr_out *list_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; size_t len; char *prefix; char *attr_str; char *bsd_list = NULL; char *linux_list; int bsd_list_len; int linux_list_len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (!fsess_isimpl(mp, FUSE_LISTXATTR)) return EOPNOTSUPP; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD); if (err) return err; /* * Add space for a NUL and the period separator if enabled. * Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + 1; fdisp_init(&fdi, sizeof(*list_xattr_in) + len); fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); /* * Retrieve Linux / FUSE compatible list size. */ list_xattr_in = fdi.indata; list_xattr_in->size = 0; attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_LISTXATTR); err = EOPNOTSUPP; } goto out; } list_xattr_out = fdi.answ; linux_list_len = list_xattr_out->size; if (linux_list_len == 0) { if (ap->a_size != NULL) *ap->a_size = linux_list_len; goto out; } /* * Retrieve Linux / FUSE compatible list values. */ fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); list_xattr_in = fdi.indata; list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out); attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator); err = fdisp_wait_answ(&fdi); if (err != 0) goto out; linux_list = fdi.answ; linux_list_len = fdi.iosize; /* * Retrieve the BSD compatible list values. * The Linux / FUSE attribute list format isn't the same * as FreeBSD's format. So we need to transform it into * FreeBSD's format before giving it to the user. */ bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK); err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len, bsd_list, &bsd_list_len); if (err != 0) goto out; if (ap->a_size != NULL) *ap->a_size = bsd_list_len; if (uio != NULL) err = uiomove(bsd_list, bsd_list_len, uio); out: free(bsd_list, M_TEMP); fdisp_destroy(&fdi); return (err); } /* struct vop_deleteextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); if (!fsess_isimpl(mp, FUSE_REMOVEXATTR)) return EOPNOTSUPP; if (vfs_isrdonly(mp)) return EROFS; err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VWRITE); if (err) return err; /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len); fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred); attr_str = fdi.indata; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err == ENOSYS) { fsess_set_notimpl(mp, FUSE_REMOVEXATTR); err = EOPNOTSUPP; } fdisp_destroy(&fdi); return (err); } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } /* * Get an NFS filehandle for a FUSE file. * * This will only work for FUSE file systems that guarantee the uniqueness of * nodeid:generation, which most don't. */ /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ static int fuse_vnop_vptofh(struct vop_vptofh_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp); _Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid), "FUSE fid type is too big"); struct mount *mp = vnode_mount(vp); struct fuse_data *data = fuse_get_mpdata(mp); struct vattr va; int err; if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) return EOPNOTSUPP; err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread); if (err) return err; /*ip = VTOI(ap->a_vp);*/ /*ufhp = (struct ufid *)ap->a_fhp;*/ fhp->len = sizeof(struct fuse_fid); fhp->nid = fvdat->nid; if (fvdat->generation <= UINT32_MAX) fhp->gen = fvdat->generation; else return EOVERFLOW; return (0); } Index: projects/fuse2/sys/kern/kern_event.c =================================================================== --- projects/fuse2/sys/kern/kern_event.c (revision 350434) +++ projects/fuse2/sys/kern/kern_event.c (revision 350435) @@ -1,2740 +1,2741 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; static struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; static struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int kq_ncallouts = 0; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_locked((knl)->kl_lockarg); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_unlocked((knl)->kl_lockarg); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops, 1 }, /* EVFILT_READ */ { &file_filtops, 1 }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops, 1 }, /* EVFILT_VNODE */ { &proc_filtops, 1 }, /* EVFILT_PROC */ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ { &timer_filtops, 1 }, /* EVFILT_TIMER */ { &file_filtops, 1 }, /* EVFILT_PROCDESC */ { &fs_filtops, 1 }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops, 1 }, /* EVFILT_USER */ { &null_filtops }, /* EVFILT_SENDFILE */ { &file_filtops, 1 }, /* EVFILT_EMPTY */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered. */ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; MPASS(list != NULL); KNL_ASSERT_LOCKED(list); if (SLIST_EMPTY(&list->kl_list)) return; memset(&kev, 0, sizeof(kev)); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); list->kl_lock(list->kl_lockarg); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); } } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(intptr_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. */ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; static void filt_timerexpire(void *knx) { struct knote *kn; struct kq_timer_cb_data *kc; kn = knx; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != 0) return; kc = kn->kn_ptr.p_v; if (kc->to == 0) return; kc->next += kc->to; callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); } /* * data contains amount of time to sleep */ static int filt_timervalidate(struct knote *kn, sbintime_t *to) { struct bintime bt; sbintime_t sbt; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* * The only fflags values supported are the timer unit * (precision) and the absolute time indicator. */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); *to -= sbt; } if (*to < 0) return (EINVAL); return (0); } static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; sbintime_t to; unsigned int ncallouts; int error; error = filt_timervalidate(kn, &to); if (error != 0) return (error); do { ncallouts = kq_ncallouts; if (ncallouts >= kq_calloutmax) return (ENOMEM); } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); callout_init(&kc->c, 1); filt_timerstart(kn, to); return (0); } static void filt_timerstart(struct knote *kn, sbintime_t to) { struct kq_timer_cb_data *kc; kc = kn->kn_ptr.p_v; if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old __unused; kc = kn->kn_ptr.p_v; callout_drain(&kc->c); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) { struct kq_timer_cb_data *kc; struct kqueue *kq; sbintime_t to; int error; switch (type) { case EVENT_REGISTER: /* Handle re-added timers that update data/fflags */ if (kev->flags & EV_ADD) { kc = kn->kn_ptr.p_v; /* Drain any existing callout. */ callout_drain(&kc->c); /* Throw away any existing undelivered record * of the timer expiration. This is done under * the presumption that if a process is * re-adding this timer with new parameters, * it is no longer interested in what may have * happened under the old parameters. If it is * interested, it can wait for the expiration, * delete the old timer definition, and then * add the new one. * * This has to be done while the kq is locked: * - if enqueued, dequeue * - make it no longer active * - clear the count of expiration events */ kq = kn->kn_kq; KQ_LOCK(kq); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; kn->kn_data = 0; KQ_UNLOCK(kq); /* Reschedule timer based on new data/fflags */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; error = filt_timervalidate(kn, &to); if (error != 0) { kn->kn_flags |= EV_ERROR; kn->kn_data = error; } else filt_timerstart(kn, to); } break; case EVENT_PROCESS: *kev = kn->kn_kevent; if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_timertouch() - invalid type (%ld)", type); break; } } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). */ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct kevent_freebsd11), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init(&rights); if (nchanges > 0) cap_rights_set(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, M_WAITOK); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). */ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(mflag); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, &cap_event_rights, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, M_NOWAIT) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, mflag); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. * We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) { error = kqueue_expand(kq, fops, kev->ident, mflag); if (error != 0) goto done; } KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } done_ev_add: /* * We can get here with kn->kn_knlist == NULL. This can happen when * the initial attach event decides that the event is "completed" * already, e.g., filt_procattach() is called on a zombie process. It * will call filt_proc() which will remove it from the list, and NULL * kn_knlist. * * KN_DISABLED will be stable while the knote is in flux, so the * unlocked read will not race with an update. */ if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int error, fd, size; KQ_NOTOWNED(kq); error = 0; to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = list; error = EBADF; } else if (kq->kq_knlistsize > fd) { to_free = list; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask, (mflag & M_WAITOK) != 0 ? HASH_WAITOK : HASH_NOWAIT); if (tmp_knhash == NULL) return (ENOMEM); KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = tmp_knhash; error = EBADF; } else if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return (error); } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; rsbt = 0; if (tsp != NULL) { if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(M_WAITOK); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. */ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_KQUEUE; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. */ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn, *tkn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and enter influx), we can * eliminate the kqueue scheduling, but this will introduce * four lock/unlock's for each knote to test. Also, marker * would be needed to keep iteration position, since filters * or other threads could remove events. */ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { /* * Do not process the influx notes, except for * the influx coming from the kq unlock in the * kqueue_scan(). In the later case, we do * not interfere with the scan, since the code * fragment in kqueue_scan() locks the knlist, * and cannot proceed until we finished. */ KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn_enter_flux(kn); KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn_leave_flux(kn); if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); } } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p was not detached", kn)); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_locked(void *arg) { mtx_assert((struct mtx *)arg, MA_OWNED); } static void knlist_mtx_assert_unlocked(void *arg) { mtx_assert((struct mtx *)arg, MA_NOTOWNED); } static void knlist_rw_rlock(void *arg) { rw_rlock((struct rwlock *)arg); } static void knlist_rw_runlock(void *arg) { rw_runlock((struct rwlock *)arg); } static void knlist_rw_assert_locked(void *arg) { rw_assert((struct rwlock *)arg, RA_LOCKED); } static void knlist_rw_assert_unlocked(void *arg) { rw_assert((struct rwlock *)arg, RA_UNLOCKED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_locked == NULL) knl->kl_assert_locked = knlist_mtx_assert_locked; else knl->kl_assert_locked = kl_assert_locked; if (kl_assert_unlocked == NULL) knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; else knl->kl_assert_unlocked = kl_assert_unlocked; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, knlist_rw_assert_locked, knlist_rw_assert_unlocked); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". */ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if ((kq->kq_state & KQ_CLOSING) != 0) return (EBADF); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int mflag) { return (uma_zalloc(knote_zone, mflag | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, mflag); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } Index: projects/fuse2/sys/kern/kern_sendfile.c =================================================================== --- projects/fuse2/sys/kern/kern_sendfile.c (revision 350434) +++ projects/fuse2/sys/kern/kern_sendfile.c (revision 350435) @@ -1,1194 +1,1191 @@ /*- * Copyright (c) 2013-2015 Gleb Smirnoff * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #define EXT_FLAG_SYNC EXT_FLAG_VENDOR1 #define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2 #define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3 -SDT_PROVIDER_DECLARE(vfs); - /* * Structure describing a single sendfile(2) I/O, which may consist of * several underlying pager I/Os. * * The syscall context allocates the structure and initializes 'nios' * to 1. As sendfile_swapin() runs through pages and starts asynchronous * paging operations, it increments 'nios'. * * Every I/O completion calls sendfile_iodone(), which decrements the 'nios', * and the syscall also calls sendfile_iodone() after allocating all mbufs, * linking them and sending to socket. Whoever reaches zero 'nios' is * responsible to * call pru_ready on the socket, to notify it of readyness * of the data. */ struct sf_io { volatile u_int nios; u_int error; int npages; struct socket *so; struct mbuf *m; vm_page_t pa[]; }; /* * Structure used to track requests with SF_SYNC flag. */ struct sendfile_sync { struct mtx mtx; struct cv cv; unsigned count; }; counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; static void sfstat_init(const void *unused) { COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), M_WAITOK); } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { struct sfstat s; COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); if (req->newptr) COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); return (SYSCTL_OUT(req, &s, sizeof(s))); } SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); /* * Detach mapped page and release resources back to the system. Called * by mbuf(9) code when last reference to a page is freed. */ static void sendfile_free_page(vm_page_t pg, bool nocache) { bool freed; vm_page_lock(pg); /* * In either case check for the object going away on us. This can * happen since we don't hold a reference to it. If so, we're * responsible for freeing the page. In 'noncache' case try to free * the page, but only if it is cheap to. */ if (vm_page_unwire_noq(pg)) { vm_object_t obj; if ((obj = pg->object) == NULL) vm_page_free(pg); else { freed = false; if (nocache && !vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) { /* Only free unmapped pages. */ if (obj->ref_count == 0 || !pmap_page_is_mapped(pg)) /* * The busy test before the object is * locked cannot be relied upon. */ freed = vm_page_try_to_free(pg); VM_OBJECT_WUNLOCK(obj); } if (!freed) { /* * If we were asked to not cache the page, place * it near the head of the inactive queue so * that it is reclaimed sooner. Otherwise, * maintain LRU. */ if (nocache) vm_page_deactivate_noreuse(pg); else if (vm_page_active(pg)) vm_page_reference(pg); else vm_page_deactivate(pg); } } } vm_page_unlock(pg); } static void sendfile_free_mext(struct mbuf *m) { struct sf_buf *sf; vm_page_t pg; bool nocache; KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF, ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m)); sf = m->m_ext.ext_arg1; pg = sf_buf_page(sf); nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE; sf_buf_free(sf); sendfile_free_page(pg, nocache); if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { struct sendfile_sync *sfs = m->m_ext.ext_arg2; mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } static void sendfile_free_mext_pg(struct mbuf *m) { struct mbuf_ext_pgs *ext_pgs; vm_page_t pg; int i; bool nocache, cache_last; KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS, ("%s: m %p !M_EXT or !EXT_PGS", __func__, m)); nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE; cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST; ext_pgs = m->m_ext.ext_pgs; for (i = 0; i < ext_pgs->npgs; i++) { if (cache_last && i == ext_pgs->npgs - 1) nocache = false; pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); sendfile_free_page(pg, nocache); } if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { struct sendfile_sync *sfs = m->m_ext.ext_arg2; mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } /* * Helper function to calculate how much data to put into page i of n. * Only first and last pages are special. */ static inline off_t xfsize(int i, int n, off_t off, off_t len) { if (i == 0) return (omin(PAGE_SIZE - (off & PAGE_MASK), len)); if (i == n - 1 && ((off + len) & PAGE_MASK) > 0) return ((off + len) & PAGE_MASK); return (PAGE_SIZE); } /* * Helper function to get offset within object for i page. */ static inline vm_ooffset_t vmoff(int i, off_t off) { if (i == 0) return ((vm_ooffset_t)off); return (trunc_page(off + i * PAGE_SIZE)); } /* * Helper function used when allocation of a page or sf_buf failed. * Pretend as if we don't have enough space, subtract xfsize() of * all pages that failed. */ static inline void fixspace(int old, int new, off_t off, int *space) { KASSERT(old > new, ("%s: old %d new %d", __func__, old, new)); /* Subtract last one. */ *space -= xfsize(old - 1, old, off, *space); old--; if (new == old) /* There was only one page. */ return; /* Subtract first one. */ if (new == 0) { *space -= xfsize(0, old, off, *space); new++; } /* Rest of pages are full sized. */ *space -= (old - new) * PAGE_SIZE; KASSERT(*space >= 0, ("%s: space went backwards", __func__)); } /* * I/O completion callback. */ static void sendfile_iodone(void *arg, vm_page_t *pg, int count, int error) { struct sf_io *sfio = arg; struct socket *so = sfio->so; for (int i = 0; i < count; i++) if (pg[i] != bogus_page) vm_page_xunbusy(pg[i]); if (error) sfio->error = error; if (!refcount_release(&sfio->nios)) return; CURVNET_SET(so->so_vnet); if (sfio->error) { /* * I/O operation failed. The state of data in the socket * is now inconsistent, and all what we can do is to tear * it down. Protocol abort method would tear down protocol * state, free all ready mbufs and detach not ready ones. * We will free the mbufs corresponding to this I/O manually. * * The socket would be marked with EIO and made available * for read, so that application receives EIO on next * syscall and eventually closes the socket. */ so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(sfio->m, sfio->npages); } else (void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, sfio->npages); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); free(sfio, M_TEMP); } -SDT_PROBE_DEFINE1(vfs, sendfile, swapin, pager_error, "int"); /* * Iterate through pages vector and request paging for non-valid pages. */ static int sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off, off_t len, int npages, int rhpages, int flags) { vm_page_t *pa = sfio->pa; int grabbed; *nios = 0; flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0; /* * First grab all the pages and wire them. Note that we grab * only required pages. Readahead pages are dealt with later. */ VM_OBJECT_WLOCK(obj); grabbed = vm_page_grab_pages(obj, OFF_TO_IDX(off), VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages); if (grabbed < npages) { for (int i = grabbed; i < npages; i++) pa[i] = NULL; npages = grabbed; rhpages = 0; } for (int i = 0; i < npages;) { int j, a, count, rv; /* Skip valid pages. */ if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK, xfsize(i, npages, off, len))) { vm_page_xunbusy(pa[i]); SFSTAT_INC(sf_pages_valid); i++; continue; } /* * Next page is invalid. Check if it belongs to pager. It * may not be there, which is a regular situation for shmem * pager. For vnode pager this happens only in case of * a sparse file. * * Important feature of vm_pager_has_page() is the hint * stored in 'a', about how many pages we can pagein after * this page in a single I/O. */ if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL, &a)) { pmap_zero_page(pa[i]); pa[i]->valid = VM_PAGE_BITS_ALL; MPASS(pa[i]->dirty == 0); vm_page_xunbusy(pa[i]); i++; continue; } /* * We want to pagein as many pages as possible, limited only * by the 'a' hint and actual request. */ count = min(a + 1, npages - i); /* * We should not pagein into a valid page, thus we first trim * any valid pages off the end of request, and substitute * to bogus_page those, that are in the middle. */ for (j = i + count - 1; j > i; j--) { if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { count--; rhpages = 0; } else break; } for (j = i + 1; j < i + count - 1; j++) if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { vm_page_xunbusy(pa[j]); SFSTAT_INC(sf_pages_valid); SFSTAT_INC(sf_pages_bogus); pa[j] = bogus_page; } refcount_acquire(&sfio->nios); rv = vm_pager_get_pages_async(obj, pa + i, count, NULL, i + count == npages ? &rhpages : NULL, &sendfile_iodone, sfio); if (rv != VM_PAGER_OK) { - SDT_PROBE1(vfs, sendfile, swapin, pager_error, rv); - for (j = 0; j < count; j++) { - vm_page_lock(*(pa + i + j)); - vm_page_unwire(*(pa + i + j), PQ_INACTIVE); - vm_page_unlock(*(pa + i + j)); + for (j = i; j < i + count; j++) { + if (pa[j] != bogus_page) { + vm_page_lock(pa[j]); + vm_page_unwire(pa[j], PQ_INACTIVE); + vm_page_unlock(pa[j]); + } } VM_OBJECT_WUNLOCK(obj); - return EIO; + return (EIO); } KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p", __func__, obj, pa[i])); SFSTAT_INC(sf_iocnt); SFSTAT_ADD(sf_pages_read, count); if (i + count == npages) SFSTAT_ADD(sf_rhpages_read, rhpages); /* * Restore the valid page pointers. They are already * unbusied, but still wired. */ for (j = i; j < i + count; j++) if (pa[j] == bogus_page) { pa[j] = vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off))); KASSERT(pa[j], ("%s: page %p[%d] disappeared", __func__, pa, j)); } i += count; (*nios)++; } VM_OBJECT_WUNLOCK(obj); if (*nios == 0 && npages != 0) SFSTAT_INC(sf_noiocnt); return (0); } static int sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, int *bsize) { struct vattr va; vm_object_t obj; struct vnode *vp; struct shmfd *shmfd; int error; vp = *vp_res = NULL; obj = NULL; shmfd = *shmfd_res = NULL; *bsize = 0; /* * The file descriptor must be a regular file and have a * backing VM object. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VREG) { error = EINVAL; goto out; } *bsize = vp->v_mount->mnt_stat.f_iosize; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) goto out; *obj_size = va.va_size; obj = vp->v_object; if (obj == NULL) { error = EINVAL; goto out; } } else if (fp->f_type == DTYPE_SHM) { error = 0; shmfd = fp->f_data; obj = shmfd->shm_object; *obj_size = shmfd->shm_size; } else { error = EINVAL; goto out; } VM_OBJECT_WLOCK(obj); if ((obj->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(obj); error = EBADF; goto out; } /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); *obj_res = obj; *vp_res = vp; *shmfd_res = shmfd; out: if (vp != NULL) VOP_UNLOCK(vp, 0); return (error); } static int sendfile_getsock(struct thread *td, int s, struct file **sock_fp, struct socket **so) { int error; *sock_fp = NULL; *so = NULL; /* * The socket must be a stream socket and connected. */ error = getsock_cap(td, s, &cap_send_rights, sock_fp, NULL, NULL); if (error != 0) return (error); *so = (*sock_fp)->f_data; if ((*so)->so_type != SOCK_STREAM) return (EINVAL); if (SOLISTENING(*so)) return (ENOTCONN); return (0); } int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj; struct socket *so; struct mbuf_ext_pgs *ext_pgs; struct mbuf *m, *mh, *mhtail; struct sf_buf *sf; struct shmfd *shmfd; struct sendfile_sync *sfs; struct vattr va; off_t off, sbytes, rem, obj_size; int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr; bool use_ext_pgs; obj = NULL; so = NULL; m = mh = NULL; sfs = NULL; hdrlen = sbytes = 0; softerr = 0; use_ext_pgs = false; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) return (error); error = sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto out; #endif SFSTAT_INC(sf_syscalls); SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags)); if (flags & SF_SYNC) { sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); } rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; /* * Protect against multiple writers to the socket. * * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but possibly shouldn't. */ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = offset; rem > 0; ) { struct sf_io *sfio; vm_page_t *pa; struct mbuf *mtail; int nios, space, npages, rhpages; mtail = NULL; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } if ((so->so_state & SS_ISCONNECTED) == 0) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error != 0) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * At the beginning of the first loop check if any headers * are specified and copy them into mbufs. Reduce space in * the socket buffer by the size of the header mbuf chain. * Clear hdr_uio here and hdrlen at the end of the first loop. */ if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; mh = m_uiotombuf(hdr_uio, M_WAITOK, space, 0, 0); hdrlen = m_length(mh, &mhtail); space -= hdrlen; /* * If header consumed all the socket buffer space, * don't waste CPU cycles and jump to the end. */ if (space == 0) { sfio = NULL; nios = 0; goto prepend_header; } hdr_uio = NULL; } if (vp != NULL) { error = vn_lock(vp, LK_SHARED); if (error != 0) goto done; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0 || off >= va.va_size) { VOP_UNLOCK(vp, 0); goto done; } if (va.va_size != obj_size) { obj_size = va.va_size; rem = nbytes ? omin(nbytes + offset, obj_size) : obj_size; rem -= off; } } if (space > rem) space = rem; else if (space > PAGE_SIZE) { /* * Use page boundaries when possible for large * requests. */ if (off & PAGE_MASK) space -= (PAGE_SIZE - (off & PAGE_MASK)); space = trunc_page(space); if (off & PAGE_MASK) space += (PAGE_SIZE - (off & PAGE_MASK)); } npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); /* * Calculate maximum allowed number of pages for readahead * at this iteration. If SF_USER_READAHEAD was set, we don't * do any heuristics and use exactly the value supplied by * application. Otherwise, we allow readahead up to "rem". * If application wants more, let it be, but there is no * reason to go above MAXPHYS. Also check against "obj_size", * since vm_pager_has_page() can hint beyond EOF. */ if (flags & SF_USER_READAHEAD) { rhpages = SF_READAHEAD(flags); } else { rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages; rhpages += SF_READAHEAD(flags); } rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages); rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - npages, rhpages); sfio = malloc(sizeof(struct sf_io) + npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); refcount_init(&sfio->nios, 1); sfio->so = so; sfio->error = 0; error = sendfile_swapin(obj, sfio, &nios, off, space, npages, rhpages, flags); - if (error) { - free(sfio, M_TEMP); + if (error != 0) { if (vp != NULL) VOP_UNLOCK(vp, 0); + free(sfio, M_TEMP); goto done; } /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ pa = sfio->pa; /* * Use unmapped mbufs if enabled for TCP. Unmapped * bufs are restricted to TCP as that is what has been * tested. In particular, unmapped mbufs have not * been tested with UNIX-domain sockets. */ if (mb_use_ext_pgs && so->so_proto->pr_protocol == IPPROTO_TCP) { use_ext_pgs = true; max_pgs = MBUF_PEXT_MAX_PGS; /* Start at last index, to wrap on first use. */ ext_pgs_idx = max_pgs - 1; } for (int i = 0; i < npages; i++) { struct mbuf *m0; /* * If a page wasn't grabbed successfully, then * trim the array. Can happen only with SF_NODISKIO. */ if (pa[i] == NULL) { SFSTAT_INC(sf_busy); fixspace(npages, i, off, &space); npages = i; softerr = EBUSY; break; } if (use_ext_pgs) { off_t xfs; ext_pgs_idx++; if (ext_pgs_idx == max_pgs) { m0 = mb_alloc_ext_pgs(M_WAITOK, false, sendfile_free_mext_pg); if (flags & SF_NOCACHE) { m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE; /* * See comment below regarding * ignoring SF_NOCACHE for the * last page. */ if ((npages - i <= max_pgs) && ((off + space) & PAGE_MASK) && (rem > space || rhpages > 0)) m0->m_ext.ext_flags |= EXT_FLAG_CACHE_LAST; } if (sfs != NULL) { m0->m_ext.ext_flags |= EXT_FLAG_SYNC; m0->m_ext.ext_arg2 = sfs; mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } ext_pgs = m0->m_ext.ext_pgs; if (i == 0) sfio->m = m0; ext_pgs_idx = 0; /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else m = m0; mtail = m0; ext_pgs->first_pg_off = vmoff(i, off) & PAGE_MASK; } if (nios) { mtail->m_flags |= M_NOTREADY; ext_pgs->nrdy++; } ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]); ext_pgs->npgs++; xfs = xfsize(i, npages, off, space); ext_pgs->last_pg_len = xfs; MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs); mtail->m_len += xfs; mtail->m_ext.ext_size += PAGE_SIZE; continue; } /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually * wait as long as necessary, but this wait * can be interrupted. For consequent * buffers, do not sleep, since several * threads might exhaust the buffers and then * deadlock. */ sf = sf_buf_alloc(pa[i], m != NULL ? SFB_NOWAIT : SFB_CATCH); if (sf == NULL) { SFSTAT_INC(sf_allocfail); for (int j = i; j < npages; j++) { vm_page_lock(pa[j]); vm_page_unwire(pa[j], PQ_INACTIVE); vm_page_unlock(pa[j]); } if (m == NULL) softerr = ENOBUFS; fixspace(npages, i, off, &space); npages = i; break; } m0 = m_get(M_WAITOK, MT_DATA); m0->m_ext.ext_buf = (char *)sf_buf_kva(sf); m0->m_ext.ext_size = PAGE_SIZE; m0->m_ext.ext_arg1 = sf; m0->m_ext.ext_type = EXT_SFBUF; m0->m_ext.ext_flags = EXT_FLAG_EMBREF; m0->m_ext.ext_free = sendfile_free_mext; /* * SF_NOCACHE sets the page as being freed upon send. * However, we ignore it for the last page in 'space', * if the page is truncated, and we got more data to * send (rem > space), or if we have readahead * configured (rhpages > 0). */ if ((flags & SF_NOCACHE) && (i != npages - 1 || !((off + space) & PAGE_MASK) || !(rem > space || rhpages > 0))) m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE; if (sfs != NULL) { m0->m_ext.ext_flags |= EXT_FLAG_SYNC; m0->m_ext.ext_arg2 = sfs; mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } m0->m_ext.ext_count = 1; m0->m_flags |= (M_EXT | M_RDONLY); if (nios) m0->m_flags |= M_NOTREADY; m0->m_data = (char *)sf_buf_kva(sf) + (vmoff(i, off) & PAGE_MASK); m0->m_len = xfsize(i, npages, off, space); if (i == 0) sfio->m = m0; /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else m = m0; mtail = m0; } if (vp != NULL) VOP_UNLOCK(vp, 0); /* Keep track of bytes processed. */ off += space; rem -= space; /* Prepend header, if any. */ if (hdrlen) { prepend_header: mhtail->m_next = m; m = mh; mh = NULL; } if (m == NULL) { KASSERT(softerr, ("%s: m NULL, no error", __func__)); error = softerr; free(sfio, M_TEMP); goto done; } /* Add the buffer chain to the socket buffer. */ KASSERT(m_length(m, NULL) == space + hdrlen, ("%s: mlen %u space %d hdrlen %d", __func__, m_length(m, NULL), space, hdrlen)); CURVNET_SET(so->so_vnet); if (nios == 0) { /* * If sendfile_swapin() didn't initiate any I/Os, * which happens if all data is cached in VM, then * we can send data right now without the * PRUS_NOTREADY flag. */ free(sfio, M_TEMP); error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); } else { sfio->npages = npages; soref(so); error = (*so->so_proto->pr_usrreqs->pru_send) (so, PRUS_NOTREADY, m, NULL, NULL, td); sendfile_iodone(sfio, NULL, 0, 0); } CURVNET_RESTORE(); m = NULL; /* pru_send always consumes */ if (error) goto done; sbytes += space + hdrlen; if (hdrlen) hdrlen = 0; if (softerr) { error = softerr; goto done; } } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { sbunlock(&so->so_snd); error = kern_writev(td, sockfd, trl_uio); if (error == 0) sbytes += td->td_retval[0]; goto out; } done: sbunlock(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (sent != NULL) { (*sent) = sbytes; } if (obj != NULL) vm_object_deallocate(obj); if (so) fdrop(sock_fp, td); if (m) m_freem(m); if (mh) m_freem(mh); if (sfs != NULL) { mtx_lock(&sfs->mtx); if (sfs->count != 0) cv_wait(&sfs->cv, &sfs->mtx); KASSERT(sfs->count == 0, ("sendfile sync still busy")); cv_destroy(&sfs->cv); mtx_destroy(&sfs->mtx); free(sfs, M_TEMP); } if (error == ERESTART) error = EINTR; return (error); } static int sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; struct file *fp; off_t sbytes; int error; /* * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if (uap->offset < 0) return (EINVAL); sbytes = 0; hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error != 0) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error != 0) goto out; #ifdef COMPAT_FREEBSD4 /* * In FreeBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } #endif } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error != 0) goto out; } } AUDIT_ARG_FD(uap->fd); /* * sendfile(2) can start at any offset within a file so we require * CAP_READ+CAP_SEEK = CAP_PREAD. */ if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0) goto out; error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, uap->nbytes, &sbytes, uap->flags, td); fdrop(fp, td); if (uap->sbytes != NULL) copyout(&sbytes, uap->sbytes, sizeof(off_t)); out: free(hdr_uio, M_IOV); free(trl_uio, M_IOV); return (error); } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sys_sendfile(struct thread *td, struct sendfile_args *uap) { return (sendfile(td, uap, 0)); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ Index: projects/fuse2/sys/kern/kern_sig.c =================================================================== --- projects/fuse2/sys/kern/kern_sig.c (revision 350434) +++ projects/fuse2/sys/kern/kern_sig.c (revision 350435) @@ -1,3858 +1,3859 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ #define SIGPROP_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; static void reschedule_signals(struct proc *p, sigset_t block, int flags); static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. */ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { sigset_t osigignore; struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } /* * As CloudABI processes cannot modify signal handlers, fully * reset all signals to their default behavior. Do ignore * SIGPIPE, as it would otherwise be impossible to recover from * writes to broken pipes and sockets. */ if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) { osigignore = ps->ps_sigignore; while (SIGNOTEMPTY(osigignore)) { sig = sig_ffs(&osigignore); SIGDELSET(osigignore, sig); if (sig != SIGPIPE) sigdflt(ps, sig); } SIGADDSET(ps->ps_sigignore, SIGPIPE); } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) error = ERESTART; if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timo, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; ets.tv_sec = 0; ets.tv_nsec = 0; if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); timespecadd(&rts, timeout, &ets); } } ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL) { if (!timevalid) { error = EINVAL; break; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; break; } timespecsub(&ets, &rts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); timo = tvtohz(&tv); } else { timo = 0; } error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo); if (timeout != NULL) { if (error == ERESTART) { /* Timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* We will calculate timeout by ourself. */ error = 0; } } } new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; int err; int ret; ret = ESRCH; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { continue; } PROC_LOCK(p); err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->pid > 0) { /* kill single process */ if ((p = pfind_any(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) pksignal(p, uap->signum, &ksi); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(int pgid, int sig, ksiginfo_t *ksi) { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p ,struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, prop); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ wakeup_swapper = 0; PROC_SLOCK(p); thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) wakeup_swapper = sleepq_abort(td, intrval); thread_unlock(td); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop; int wakeup_swapper; wakeup_swapper = 0; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } out: PROC_SUNLOCK(p); thread_unlock(td); if (wakeup_swapper) kick_proc0(); } static int sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); MPASS(sending || td == curthread); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); } else if (!TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } } else if (!TD_IS_SUSPENDED(td2)) { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; int prop; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. * Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p, 0); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: thread_suspend_switch(td, p); if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; prop = sigprop(td->td_xsig); td2 = sigtd(p, td->td_xsig, prop); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); while ((sig = sig_ffs(&block)) != 0) { SIGDELSET(block, sig); td = sigtd(p, sig, 0); signotify(td); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. */ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; sigset_t sigpending; ksiginfo_t ksi; int prop, sig, traced; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ sig = SIGSTOP; td->td_dbgflags |= TDB_FSTP; } else { sig = sig_ffs(&sigpending); } if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); continue; } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. * Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) continue; /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); continue; } /* * If the traced bit got turned off, requeue * the signal and go back up to the top to * rescan signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); continue; } } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ if (prop & SIGPROP_STOP) { if (p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT) || (p->p_pgrp->pg_jobc == 0 && prop & SIGPROP_TTYSTOP)) break; /* == ignore */ if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); return (-1); } mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); goto next; } else if (prop & SIGPROP_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SIGPROP_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ sigqueue_delete(&p->p_sigqueue, sig); next:; } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if ((p->p_stops & S_SIG) != 0) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp, 0); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vnode_close_locked(td, oldvp); oldvp = vp; lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vnode_close_locked(td, oldvp); } else { nextvp = oldvp; } } else { vnode_close_locked(td, oldvp); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp, 0); error = EFAULT; goto out; } VOP_UNLOCK(vp, 0); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0) goto out2; sbuf_printf(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_printf(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front if it. */ if (name[0] != '/') { fullpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if (kern___getcwd(td, fullpath, UIO_SYSSPACE, MAXPATHLEN, MAXPATHLEN) != 0) { free(fullpath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(fullpath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); sbuf_printf(sb, "\""); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } Index: projects/fuse2/sys/kern/sys_process.c =================================================================== --- projects/fuse2/sys/kern/sys_process.c (revision 350434) +++ projects/fuse2/sys/kern/sys_process.c (revision 350435) @@ -1,1557 +1,1558 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include struct ptrace_io_desc32 { int piod_op; uint32_t piod_offs; uint32_t piod_addr; uint32_t piod_len; }; struct ptrace_sc_ret32 { uint32_t sr_retval[2]; int sr_error; }; struct ptrace_vm_entry32 { int pve_entry; int pve_timestamp; uint32_t pve_start; uint32_t pve_end; uint32_t pve_offset; u_int pve_prot; u_int pve_pathlen; int32_t pve_fileid; u_int pve_fsid; uint32_t pve_path; }; #endif /* * Functions implemented using PROC_ACTION(): * * proc_read_regs(proc, regs) * Get the current user-visible register set from the process * and copy it into the regs structure (). * The process is stopped at the time read_regs is called. * * proc_write_regs(proc, regs) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * Depending on the architecture this may have fix-up work to do, * especially if the IAR or PCW are modified. * The process is stopped at the time write_regs is called. * * proc_read_fpregs, proc_write_fpregs * deal with the floating point register set, otherwise as above. * * proc_read_dbregs, proc_write_dbregs * deal with the processor debug register set, otherwise as above. * * proc_sstep(proc) * Arrange for the process to trap after executing a single instruction. */ #define PROC_ACTION(action) do { \ int error; \ \ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \ if ((td->td_proc->p_flag & P_INMEM) == 0) \ error = EIO; \ else \ error = (action); \ return (error); \ } while(0) int proc_read_regs(struct thread *td, struct reg *regs) { PROC_ACTION(fill_regs(td, regs)); } int proc_write_regs(struct thread *td, struct reg *regs) { PROC_ACTION(set_regs(td, regs)); } int proc_read_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(fill_dbregs(td, dbregs)); } int proc_write_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(set_dbregs(td, dbregs)); } /* * Ptrace doesn't support fpregs at all, and there are no security holes * or translations for fpregs, so we can just copy them. */ int proc_read_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(fill_fpregs(td, fpregs)); } int proc_write_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(set_fpregs(td, fpregs)); } #ifdef COMPAT_FREEBSD32 /* For 32 bit binaries, we need to expose the 32 bit regs layouts. */ int proc_read_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(fill_regs32(td, regs32)); } int proc_write_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(set_regs32(td, regs32)); } int proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(fill_dbregs32(td, dbregs32)); } int proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(set_dbregs32(td, dbregs32)); } int proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(fill_fpregs32(td, fpregs32)); } int proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(set_fpregs32(td, fpregs32)); } #endif int proc_sstep(struct thread *td) { PROC_ACTION(ptrace_single_step(td)); } int proc_rwmem(struct proc *p, struct uio *uio) { vm_map_t map; vm_offset_t pageno; /* page number */ vm_prot_t reqprot; int error, fault_flags, page_offset, writing; /* * Assert that someone has locked this vmspace. (Should be * curthread but we can't assert that.) This keeps the process * from exiting out from under us until this operation completes. */ PROC_ASSERT_HELD(p); PROC_LOCK_ASSERT(p, MA_NOTOWNED); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * If we are writing, then we request vm_fault() to create a private * copy of each page. Since these copies will not be writeable by the * process, we must explicity request that they be dirtied. */ writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ; fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_offset_t uva; u_int len; vm_page_t m; uva = (vm_offset_t)uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * Fault and hold the page on behalf of the process. */ error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m); if (error != KERN_SUCCESS) { if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EFAULT; break; } /* * Now do the i/o move. */ error = uiomove_fromphys(&m, page_offset, len, uio); /* Make the I-cache coherent for breakpoints. */ if (writing && error == 0) { vm_map_lock_read(map); if (vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_EXECUTE)) vm_sync_icache(map, uva, len); vm_map_unlock_read(map); } /* * Release the page. */ vm_page_lock(m); if (vm_page_unwire(m, PQ_ACTIVE) && m->object == NULL) vm_page_free(m); vm_page_unlock(m); } while (error == 0 && uio->uio_resid > 0); return (error); } static ssize_t proc_iop(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len, enum uio_rw rw) { struct iovec iov; struct uio uio; ssize_t slen; MPASS(len < SSIZE_MAX); slen = (ssize_t)len; iov.iov_base = (caddr_t)buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = va; uio.uio_resid = slen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = rw; uio.uio_td = td; proc_rwmem(p, &uio); if (uio.uio_resid == slen) return (-1); return (slen - uio.uio_resid); } ssize_t proc_readmem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_READ)); } ssize_t proc_writemem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_WRITE)); } static int ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve) { struct vattr vattr; vm_map_t map; vm_map_entry_t entry; vm_object_t obj, tobj, lobj; struct vmspace *vm; struct vnode *vp; char *freepath, *fullpath; u_int pathlen; int error, index; error = 0; obj = NULL; vm = vmspace_acquire_ref(p); map = &vm->vm_map; vm_map_lock_read(map); do { entry = map->header.next; index = 0; while (index < pve->pve_entry && entry != &map->header) { entry = entry->next; index++; } if (index != pve->pve_entry) { error = EINVAL; break; } KASSERT((map->header.eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap in map header")); while ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { entry = entry->next; index++; } if (entry == &map->header) { error = ENOENT; break; } /* We got an entry. */ pve->pve_entry = index + 1; pve->pve_timestamp = map->timestamp; pve->pve_start = entry->start; pve->pve_end = entry->end - 1; pve->pve_offset = entry->offset; pve->pve_prot = entry->protection; /* Backing object's path needed? */ if (pve->pve_pathlen == 0) break; pathlen = pve->pve_pathlen; pve->pve_pathlen = 0; obj = entry->object.vm_object; if (obj != NULL) VM_OBJECT_RLOCK(obj); } while (0); vm_map_unlock_read(map); pve->pve_fsid = VNOVAL; pve->pve_fileid = VNOVAL; if (error == 0 && obj != NULL) { lobj = obj; for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; pve->pve_offset += tobj->backing_object_offset; } vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { freepath = NULL; fullpath = NULL; vn_fullpath(td, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) { pve->pve_fileid = vattr.va_fileid; pve->pve_fsid = vattr.va_fsid; } vput(vp); if (fullpath != NULL) { pve->pve_pathlen = strlen(fullpath) + 1; if (pve->pve_pathlen <= pathlen) { error = copyout(fullpath, pve->pve_path, pve->pve_pathlen); } else error = ENAMETOOLONG; } if (freepath != NULL) free(freepath, M_TEMP); } } vmspace_free(vm); if (error == 0) CTR3(KTR_PTRACE, "PT_VM_ENTRY: pid %d, entry %d, start %p", p->p_pid, pve->pve_entry, pve->pve_start); return (error); } #ifdef COMPAT_FREEBSD32 static int ptrace_vm_entry32(struct thread *td, struct proc *p, struct ptrace_vm_entry32 *pve32) { struct ptrace_vm_entry pve; int error; pve.pve_entry = pve32->pve_entry; pve.pve_pathlen = pve32->pve_pathlen; pve.pve_path = (void *)(uintptr_t)pve32->pve_path; error = ptrace_vm_entry(td, p, &pve); if (error == 0) { pve32->pve_entry = pve.pve_entry; pve32->pve_timestamp = pve.pve_timestamp; pve32->pve_start = pve.pve_start; pve32->pve_end = pve.pve_end; pve32->pve_offset = pve.pve_offset; pve32->pve_prot = pve.pve_prot; pve32->pve_fileid = pve.pve_fileid; pve32->pve_fsid = pve.pve_fsid; } pve32->pve_pathlen = pve.pve_pathlen; return (error); } static void ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl, struct ptrace_lwpinfo32 *pl32) { bzero(pl32, sizeof(*pl32)); pl32->pl_lwpid = pl->pl_lwpid; pl32->pl_event = pl->pl_event; pl32->pl_flags = pl->pl_flags; pl32->pl_sigmask = pl->pl_sigmask; pl32->pl_siglist = pl->pl_siglist; siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo); strcpy(pl32->pl_tdname, pl->pl_tdname); pl32->pl_child_pid = pl->pl_child_pid; pl32->pl_syscall_code = pl->pl_syscall_code; pl32->pl_syscall_narg = pl->pl_syscall_narg; } static void ptrace_sc_ret_to32(const struct ptrace_sc_ret *psr, struct ptrace_sc_ret32 *psr32) { bzero(psr32, sizeof(*psr32)); psr32->sr_retval[0] = psr->sr_retval[0]; psr32->sr_retval[1] = psr->sr_retval[1]; psr32->sr_error = psr->sr_error; } #endif /* COMPAT_FREEBSD32 */ /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif #ifdef COMPAT_FREEBSD32 /* * This CPP subterfuge is to try and reduce the number of ifdefs in * the body of the code. * COPYIN(uap->addr, &r.reg, sizeof r.reg); * becomes either: * copyin(uap->addr, &r.reg, sizeof r.reg); * or * copyin(uap->addr, &r.reg32, sizeof r.reg32); * .. except this is done at runtime. */ #define BZERO(a, s) wrap32 ? \ bzero(a ## 32, s ## 32) : \ bzero(a, s) #define COPYIN(u, k, s) wrap32 ? \ copyin(u, k ## 32, s ## 32) : \ copyin(u, k, s) #define COPYOUT(k, u, s) wrap32 ? \ copyout(k ## 32, u, s ## 32) : \ copyout(k, u, s) #else #define BZERO(a, s) bzero(a, s) #define COPYIN(u, k, s) copyin(u, k, s) #define COPYOUT(k, u, s) copyout(k, u, s) #endif int sys_ptrace(struct thread *td, struct ptrace_args *uap) { /* * XXX this obfuscation is to reduce stack usage, but the register * structs may be too large to put on the stack anyway. */ union { struct ptrace_io_desc piod; struct ptrace_lwpinfo pl; struct ptrace_vm_entry pve; struct dbreg dbreg; struct fpreg fpreg; struct reg reg; #ifdef COMPAT_FREEBSD32 struct dbreg32 dbreg32; struct fpreg32 fpreg32; struct reg32 reg32; struct ptrace_io_desc32 piod32; struct ptrace_lwpinfo32 pl32; struct ptrace_vm_entry32 pve32; #endif char args[sizeof(td->td_sa.args)]; struct ptrace_sc_ret psr; int ptevents; } r; void *addr; int error = 0; #ifdef COMPAT_FREEBSD32 int wrap32 = 0; if (SV_CURPROC_FLAG(SV_ILP32)) wrap32 = 1; #endif AUDIT_ARG_PID(uap->pid); AUDIT_ARG_CMD(uap->req); AUDIT_ARG_VALUE(uap->data); addr = &r; switch (uap->req) { case PT_GET_EVENT_MASK: case PT_LWPINFO: case PT_GET_SC_ARGS: case PT_GET_SC_RET: break; case PT_GETREGS: BZERO(&r.reg, sizeof r.reg); break; case PT_GETFPREGS: BZERO(&r.fpreg, sizeof r.fpreg); break; case PT_GETDBREGS: BZERO(&r.dbreg, sizeof r.dbreg); break; case PT_SETREGS: error = COPYIN(uap->addr, &r.reg, sizeof r.reg); break; case PT_SETFPREGS: error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg); break; case PT_SETDBREGS: error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg); break; case PT_SET_EVENT_MASK: if (uap->data != sizeof(r.ptevents)) error = EINVAL; else error = copyin(uap->addr, &r.ptevents, uap->data); break; case PT_IO: error = COPYIN(uap->addr, &r.piod, sizeof r.piod); break; case PT_VM_ENTRY: error = COPYIN(uap->addr, &r.pve, sizeof r.pve); break; default: addr = uap->addr; break; } if (error) return (error); error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data); if (error) return (error); switch (uap->req) { case PT_VM_ENTRY: error = COPYOUT(&r.pve, uap->addr, sizeof r.pve); break; case PT_IO: error = COPYOUT(&r.piod, uap->addr, sizeof r.piod); break; case PT_GETREGS: error = COPYOUT(&r.reg, uap->addr, sizeof r.reg); break; case PT_GETFPREGS: error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg); break; case PT_GETDBREGS: error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg); break; case PT_GET_EVENT_MASK: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.ptevents, uap->addr, uap->data); break; case PT_LWPINFO: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.pl, uap->addr, uap->data); break; case PT_GET_SC_ARGS: error = copyout(r.args, uap->addr, MIN(uap->data, sizeof(r.args))); break; case PT_GET_SC_RET: error = copyout(&r.psr, uap->addr, MIN(uap->data, sizeof(r.psr))); break; } return (error); } #undef COPYIN #undef COPYOUT #undef BZERO #ifdef COMPAT_FREEBSD32 /* * PROC_READ(regs, td2, addr); * becomes either: * proc_read_regs(td2, addr); * or * proc_read_regs32(td2, addr); * .. except this is done at runtime. There is an additional * complication in that PROC_WRITE disallows 32 bit consumers * from writing to 64 bit address space targets. */ #define PROC_READ(w, t, a) wrap32 ? \ proc_read_ ## w ## 32(t, a) : \ proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) wrap32 ? \ (safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \ proc_write_ ## w (t, a) #else #define PROC_READ(w, t, a) proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) proc_write_ ## w (t, a) #endif void proc_set_traced(struct proc *p, bool stop) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; if (stop) p->p_flag2 |= P2_PTRACE_FSTP; p->p_ptevents = PTRACE_DEFAULT; } int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) { struct iovec iov; struct uio uio; struct proc *curp, *p, *pp; struct thread *td2 = NULL, *td3; struct ptrace_io_desc *piod = NULL; struct ptrace_lwpinfo *pl; struct ptrace_sc_ret *psr; int error, num, tmp; int proctree_locked = 0; lwpid_t tid = 0, *buf; #ifdef COMPAT_FREEBSD32 int wrap32 = 0, safe = 0; struct ptrace_io_desc32 *piod32 = NULL; struct ptrace_lwpinfo32 *pl32 = NULL; struct ptrace_sc_ret32 *psr32 = NULL; union { struct ptrace_lwpinfo pl; struct ptrace_sc_ret psr; } r; #endif curp = td->td_proc; /* Lock proctree before locking the process. */ switch (req) { case PT_TRACE_ME: case PT_ATTACH: case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_FOLLOW_FORK: case PT_LWP_EVENTS: case PT_GET_EVENT_MASK: case PT_SET_EVENT_MASK: case PT_DETACH: case PT_GET_SC_ARGS: sx_xlock(&proctree_lock); proctree_locked = 1; break; default: break; } if (req == PT_TRACE_ME) { p = td->td_proc; PROC_LOCK(p); } else { if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } } else { td2 = tdfind(pid, -1); if (td2 == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } p = td2->td_proc; tid = pid; pid = p->p_pid; } } AUDIT_ARG_PROCESS(p); if ((p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto fail; } if ((error = p_cansee(td, p)) != 0) goto fail; if ((error = p_candebug(td, p)) != 0) goto fail; /* * System processes can't be debugged. */ if ((p->p_flag & P_SYSTEM) != 0) { error = EINVAL; goto fail; } if (tid == 0) { if ((p->p_flag & P_STOPPED_TRACE) != 0) { KASSERT(p->p_xthread != NULL, ("NULL p_xthread")); td2 = p->p_xthread; } else { td2 = FIRST_THREAD_IN_PROC(p); } tid = td2->td_tid; } #ifdef COMPAT_FREEBSD32 /* * Test if we're a 32 bit client and what the target is. * Set the wrap controls accordingly. */ if (SV_CURPROC_FLAG(SV_ILP32)) { if (SV_PROC_FLAG(td2->td_proc, SV_ILP32)) safe = 1; wrap32 = 1; } #endif /* * Permissions check */ switch (req) { case PT_TRACE_ME: /* * Always legal, when there is a parent process which * could trace us. Otherwise, reject. */ if ((p->p_flag & P_TRACED) != 0) { error = EBUSY; goto fail; } if (p->p_pptr == initproc) { error = EPERM; goto fail; } break; case PT_ATTACH: /* Self */ if (p == td->td_proc) { error = EINVAL; goto fail; } /* Already traced */ if (p->p_flag & P_TRACED) { error = EBUSY; goto fail; } /* Can't trace an ancestor if you're being traced. */ if (curp->p_flag & P_TRACED) { for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { if (pp == p) { error = EINVAL; goto fail; } } } /* OK */ break; case PT_CLEARSTEP: /* Allow thread to clear single step for itself */ if (td->td_tid == tid) break; /* FALLTHROUGH */ default: /* not being traced... */ if ((p->p_flag & P_TRACED) == 0) { error = EPERM; goto fail; } /* not being traced by YOU */ if (p->p_pptr != td->td_proc) { error = EBUSY; goto fail; } /* not currently stopped */ if ((p->p_flag & P_STOPPED_TRACE) == 0 || p->p_suspcount != p->p_numthreads || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } /* OK */ break; } /* Keep this process around until we finish this request. */ _PHOLD(p); #ifdef FIX_SSTEP /* * Single step fixup ala procfs */ FIX_SSTEP(td2); #endif /* * Actually do the requests */ td->td_retval[0] = 0; switch (req) { case PT_TRACE_ME: /* set my trace flag and "owner" so it can read/write me */ proc_set_traced(p, false); if (p->p_flag & P_PPWAIT) p->p_flag |= P_PPTRACE; CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid); break; case PT_ATTACH: /* security check done above */ /* * It would be nice if the tracing relationship was separate * from the parent relationship but that would require * another set of links in the proc struct or for "wait" * to scan the entire proc table. To make life easier, * we just re-parent the process we're trying to trace. * The old parent is remembered so we can put things back * on a "detach". */ proc_set_traced(p, true); if (p->p_pptr != td->td_proc) { proc_reparent(p, td->td_proc, false); } CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid, p->p_oppid); sx_xunlock(&proctree_lock); proctree_locked = 0; MPASS(p->p_xthread == NULL); MPASS((p->p_flag & P_STOPPED_TRACE) == 0); /* * If already stopped due to a stop signal, clear the * existing stop before triggering a traced SIGSTOP. */ if ((p->p_flag & P_STOPPED_SIG) != 0) { PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); } kern_psignal(p, SIGSTOP); break; case PT_CLEARSTEP: CTR2(KTR_PTRACE, "PT_CLEARSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_clear_single_step(td2); break; case PT_SETSTEP: CTR2(KTR_PTRACE, "PT_SETSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_single_step(td2); break; case PT_SUSPEND: CTR2(KTR_PTRACE, "PT_SUSPEND: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_SUSPEND; thread_lock(td2); td2->td_flags |= TDF_NEEDSUSPCHK; thread_unlock(td2); break; case PT_RESUME: CTR2(KTR_PTRACE, "PT_RESUME: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags &= ~TDB_SUSPEND; break; case PT_FOLLOW_FORK: CTR3(KTR_PTRACE, "PT_FOLLOW_FORK: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_FORK ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_FORK; else p->p_ptevents &= ~PTRACE_FORK; break; case PT_LWP_EVENTS: CTR3(KTR_PTRACE, "PT_LWP_EVENTS: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_LWP ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_LWP; else p->p_ptevents &= ~PTRACE_LWP; break; case PT_GET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } CTR2(KTR_PTRACE, "PT_GET_EVENT_MASK: pid %d mask %#x", p->p_pid, p->p_ptevents); *(int *)addr = p->p_ptevents; break; case PT_SET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } tmp = *(int *)addr; if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX | PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) { error = EINVAL; break; } CTR3(KTR_PTRACE, "PT_SET_EVENT_MASK: pid %d mask %#x -> %#x", p->p_pid, p->p_ptevents, tmp); p->p_ptevents = tmp; break; case PT_GET_SC_ARGS: CTR1(KTR_PTRACE, "PT_GET_SC_ARGS: pid %d", p->p_pid); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) == 0 #ifdef COMPAT_FREEBSD32 || (wrap32 && !safe) #endif ) { error = EINVAL; break; } bzero(addr, sizeof(td2->td_sa.args)); #ifdef COMPAT_FREEBSD32 if (wrap32) for (num = 0; num < nitems(td2->td_sa.args); num++) ((uint32_t *)addr)[num] = (uint32_t) td2->td_sa.args[num]; else #endif bcopy(td2->td_sa.args, addr, td2->td_sa.narg * sizeof(register_t)); break; case PT_GET_SC_RET: if ((td2->td_dbgflags & (TDB_SCX)) == 0 #ifdef COMPAT_FREEBSD32 || (wrap32 && !safe) #endif ) { error = EINVAL; break; } #ifdef COMPAT_FREEBSD32 if (wrap32) { psr = &r.psr; psr32 = addr; } else #endif psr = addr; bzero(psr, sizeof(*psr)); psr->sr_error = td2->td_errno; if (psr->sr_error == 0) { psr->sr_retval[0] = td2->td_retval[0]; psr->sr_retval[1] = td2->td_retval[1]; } #ifdef COMPAT_FREEBSD32 if (wrap32) ptrace_sc_ret_to32(psr, psr32); #endif CTR4(KTR_PTRACE, "PT_GET_SC_RET: pid %d error %d retval %#lx,%#lx", p->p_pid, psr->sr_error, psr->sr_retval[0], psr->sr_retval[1]); break; case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_DETACH: /* Zero means do not send any signal */ if (data < 0 || data > _SIG_MAXSIG) { error = EINVAL; break; } switch (req) { case PT_STEP: CTR3(KTR_PTRACE, "PT_STEP: tid %d (pid %d), sig = %d", td2->td_tid, p->p_pid, data); error = ptrace_single_step(td2); if (error) goto out; break; case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: if (addr != (void *)1) { error = ptrace_set_pc(td2, (u_long)(uintfptr_t)addr); if (error) goto out; } switch (req) { case PT_TO_SCE: p->p_ptevents |= PTRACE_SCE; CTR4(KTR_PTRACE, "PT_TO_SCE: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_TO_SCX: p->p_ptevents |= PTRACE_SCX; CTR4(KTR_PTRACE, "PT_TO_SCX: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_SYSCALL: p->p_ptevents |= PTRACE_SYSCALL; CTR4(KTR_PTRACE, "PT_SYSCALL: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_CONTINUE: CTR3(KTR_PTRACE, "PT_CONTINUE: pid %d, PC = %#lx, sig = %d", p->p_pid, (u_long)(uintfptr_t)addr, data); break; } break; case PT_DETACH: /* * Reset the process parent. * * NB: This clears P_TRACED before reparenting * a detached process back to its original * parent. Otherwise the debugee will be set * as an orphan of the debugger. */ p->p_flag &= ~(P_TRACED | P_WAITED); if (p->p_oppid != p->p_pptr->p_pid) { PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); pp = proc_realparent(p); proc_reparent(p, pp, false); if (pp == initproc) p->p_sigparent = SIGCHLD; CTR3(KTR_PTRACE, "PT_DETACH: pid %d reparented to pid %d, sig %d", p->p_pid, pp->p_pid, data); } else CTR2(KTR_PTRACE, "PT_DETACH: pid %d, sig %d", p->p_pid, data); p->p_ptevents = 0; FOREACH_THREAD_IN_PROC(p, td3) { if ((td3->td_dbgflags & TDB_FSTP) != 0) { sigqueue_delete(&td3->td_sigqueue, SIGSTOP); } td3->td_dbgflags &= ~(TDB_XSIG | TDB_FSTP | TDB_SUSPEND); } if ((p->p_flag2 & P2_PTRACE_FSTP) != 0) { sigqueue_delete(&p->p_sigqueue, SIGSTOP); p->p_flag2 &= ~P2_PTRACE_FSTP; } /* should we send SIGCHLD? */ /* childproc_continued(p); */ break; } sx_xunlock(&proctree_lock); proctree_locked = 0; sendsig: MPASS(proctree_locked == 0); /* * Clear the pending event for the thread that just * reported its event (p_xthread). This may not be * the thread passed to PT_CONTINUE, PT_STEP, etc. if * the debugger is resuming a different thread. * * Deliver any pending signal via the reporting thread. */ MPASS(p->p_xthread != NULL); p->p_xthread->td_dbgflags &= ~TDB_XSIG; p->p_xthread->td_xsig = data; p->p_xthread = NULL; p->p_xsig = data; /* * P_WKILLED is insurance that a PT_KILL/SIGKILL * always works immediately, even if another thread is * unsuspended first and attempts to handle a * different signal or if the POSIX.1b style signal * queue cannot accommodate any new signals. */ if (data == SIGKILL) proc_wkilled(p); /* * Unsuspend all threads. To leave a thread * suspended, use PT_SUSPEND to suspend it before * continuing the process. */ PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_TRACE | P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); break; case PT_WRITE_I: case PT_WRITE_D: td2->td_dbgflags |= TDB_USERWR; PROC_UNLOCK(p); error = 0; if (proc_writemem(td, p, (off_t)(uintptr_t)addr, &data, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_WRITE: pid %d: %p <= %#x", p->p_pid, addr, data); PROC_LOCK(p); break; case PT_READ_I: case PT_READ_D: PROC_UNLOCK(p); error = tmp = 0; if (proc_readmem(td, p, (off_t)(uintptr_t)addr, &tmp, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_READ: pid %d: %p >= %#x", p->p_pid, addr, tmp); td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_IO: #ifdef COMPAT_FREEBSD32 if (wrap32) { piod32 = addr; iov.iov_base = (void *)(uintptr_t)piod32->piod_addr; iov.iov_len = piod32->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs; uio.uio_resid = piod32->piod_len; } else #endif { piod = addr; iov.iov_base = piod->piod_addr; iov.iov_len = piod->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs; uio.uio_resid = piod->piod_len; } uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_USERSPACE; uio.uio_td = td; #ifdef COMPAT_FREEBSD32 tmp = wrap32 ? piod32->piod_op : piod->piod_op; #else tmp = piod->piod_op; #endif switch (tmp) { case PIOD_READ_D: case PIOD_READ_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: READ (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); uio.uio_rw = UIO_READ; break; case PIOD_WRITE_D: case PIOD_WRITE_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: WRITE (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); td2->td_dbgflags |= TDB_USERWR; uio.uio_rw = UIO_WRITE; break; default: error = EINVAL; goto out; } PROC_UNLOCK(p); error = proc_rwmem(p, &uio); #ifdef COMPAT_FREEBSD32 if (wrap32) piod32->piod_len -= uio.uio_resid; else #endif piod->piod_len -= uio.uio_resid; PROC_LOCK(p); break; case PT_KILL: CTR1(KTR_PTRACE, "PT_KILL: pid %d", p->p_pid); data = SIGKILL; goto sendsig; /* in PT_CONTINUE above */ case PT_SETREGS: CTR2(KTR_PTRACE, "PT_SETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(regs, td2, addr); break; case PT_GETREGS: CTR2(KTR_PTRACE, "PT_GETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(regs, td2, addr); break; case PT_SETFPREGS: CTR2(KTR_PTRACE, "PT_SETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(fpregs, td2, addr); break; case PT_GETFPREGS: CTR2(KTR_PTRACE, "PT_GETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(fpregs, td2, addr); break; case PT_SETDBREGS: CTR2(KTR_PTRACE, "PT_SETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(dbregs, td2, addr); break; case PT_GETDBREGS: CTR2(KTR_PTRACE, "PT_GETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(dbregs, td2, addr); break; case PT_LWPINFO: if (data <= 0 || #ifdef COMPAT_FREEBSD32 (!wrap32 && data > sizeof(*pl)) || (wrap32 && data > sizeof(*pl32))) { #else data > sizeof(*pl)) { #endif error = EINVAL; break; } #ifdef COMPAT_FREEBSD32 if (wrap32) { pl = &r.pl; pl32 = addr; } else #endif pl = addr; bzero(pl, sizeof(*pl)); pl->pl_lwpid = td2->td_tid; pl->pl_event = PL_EVENT_NONE; pl->pl_flags = 0; if (td2->td_dbgflags & TDB_XSIG) { pl->pl_event = PL_EVENT_SIGNAL; if (td2->td_si.si_signo != 0 && #ifdef COMPAT_FREEBSD32 ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo)) || (wrap32 && data >= offsetof(struct ptrace_lwpinfo32, pl_siginfo) + sizeof(struct siginfo32))) #else data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo) #endif ){ pl->pl_flags |= PL_FLAG_SI; pl->pl_siginfo = td2->td_si; } } if (td2->td_dbgflags & TDB_SCE) pl->pl_flags |= PL_FLAG_SCE; else if (td2->td_dbgflags & TDB_SCX) pl->pl_flags |= PL_FLAG_SCX; if (td2->td_dbgflags & TDB_EXEC) pl->pl_flags |= PL_FLAG_EXEC; if (td2->td_dbgflags & TDB_FORK) { pl->pl_flags |= PL_FLAG_FORKED; pl->pl_child_pid = td2->td_dbg_forked; if (td2->td_dbgflags & TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORKED; } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) == TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORK_DONE; if (td2->td_dbgflags & TDB_CHILD) pl->pl_flags |= PL_FLAG_CHILD; if (td2->td_dbgflags & TDB_BORN) pl->pl_flags |= PL_FLAG_BORN; if (td2->td_dbgflags & TDB_EXIT) pl->pl_flags |= PL_FLAG_EXITED; pl->pl_sigmask = td2->td_sigmask; pl->pl_siglist = td2->td_siglist; strcpy(pl->pl_tdname, td2->td_name); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) != 0) { pl->pl_syscall_code = td2->td_sa.code; pl->pl_syscall_narg = td2->td_sa.narg; } else { pl->pl_syscall_code = 0; pl->pl_syscall_narg = 0; } #ifdef COMPAT_FREEBSD32 if (wrap32) ptrace_lwpinfo_to32(pl, pl32); #endif CTR6(KTR_PTRACE, "PT_LWPINFO: tid %d (pid %d) event %d flags %#x child pid %d syscall %d", td2->td_tid, p->p_pid, pl->pl_event, pl->pl_flags, pl->pl_child_pid, pl->pl_syscall_code); break; case PT_GETNUMLWPS: CTR2(KTR_PTRACE, "PT_GETNUMLWPS: pid %d: %d threads", p->p_pid, p->p_numthreads); td->td_retval[0] = p->p_numthreads; break; case PT_GETLWPLIST: CTR3(KTR_PTRACE, "PT_GETLWPLIST: pid %d: data %d, actual %d", p->p_pid, data, p->p_numthreads); if (data <= 0) { error = EINVAL; break; } num = imin(p->p_numthreads, data); PROC_UNLOCK(p); buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); if (!error) td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_VM_TIMESTAMP: CTR2(KTR_PTRACE, "PT_VM_TIMESTAMP: pid %d: timestamp %d", p->p_pid, p->p_vmspace->vm_map.timestamp); td->td_retval[0] = p->p_vmspace->vm_map.timestamp; break; case PT_VM_ENTRY: PROC_UNLOCK(p); #ifdef COMPAT_FREEBSD32 if (wrap32) error = ptrace_vm_entry32(td, p, addr); else #endif error = ptrace_vm_entry(td, p, addr); PROC_LOCK(p); break; default: #ifdef __HAVE_PTRACE_MACHDEP if (req >= PT_FIRSTMACH) { PROC_UNLOCK(p); error = cpu_ptrace(td2, req, addr, data); PROC_LOCK(p); } else #endif /* Unknown request. */ error = EINVAL; break; } out: /* Drop our hold on this process now that the request has completed. */ _PRELE(p); fail: PROC_UNLOCK(p); if (proctree_locked) sx_xunlock(&proctree_lock); return (error); } #undef PROC_READ #undef PROC_WRITE /* * Stop a process because of a debugging event; * stay stopped until p->p_step is cleared * (cleared by PIOCCONT in procfs). */ void stopevent(struct proc *p, unsigned int event, unsigned int val) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_step = 1; CTR3(KTR_PTRACE, "stopevent: pid %d event %u val %u", p->p_pid, event, val); do { if (event != S_EXIT) p->p_xsig = val; p->p_xthread = NULL; p->p_stype = event; /* Which event caused the stop? */ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); } while (p->p_step); } Index: projects/fuse2/sys/kern/uipc_shm.c =================================================================== --- projects/fuse2/sys/kern/uipc_shm.c (revision 350434) +++ projects/fuse2/sys/kern/uipc_shm.c (revision 350435) @@ -1,1180 +1,1181 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * posixshmcontrol(1) allows users to inspect the state of the memory * objects. Per-uid swap resource limit controls total amount of * memory that user can consume for anonymous objects, including * shared. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr64 shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_ioctl_t shm_ioctl; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; /* File descriptor operations. */ struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = shm_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (m->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(m); if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); vm_page_xunbusy(m); } vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unwire(m, PQ_ACTIVE); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { switch (com) { case FIONBIO: case FIOASYNC: /* * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, * just like it would on an unlinked regular file */ return (0); default: return (ENOTTY); } } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = howmany(sb->st_size, sb->st_blksize); mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page. */ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, * and therefore not recently * accessed, immediately enqueue it * for asynchronous laundering. The * current operation is not regarded * as an access. */ vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = IDX_TO_OFF(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; shmfd->shm_ino = alloc_unr64(&shm_ino_unr); refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. */ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL)); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; shmfd = fp->f_data; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; /* Don't permit shared writable mappings on read-only descriptors. */ if ((flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); maxprot &= cap_maxprot; /* See comment in vn_mmap(). */ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - objsize) return (EINVAL); #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) return (error); #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, FALSE, td); if (error != 0) vm_object_deallocate(shmfd->shm_object); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. */ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) { const char *path, *pr_path; size_t pr_pathlen; bool visible; sx_assert(&shm_dict_lock, SA_LOCKED); kif->kf_type = KF_TYPE_SHM; kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); visible = strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/'; if (list && !visible) return (EPERM); if (visible) path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } } return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp __unused) { int res; sx_slock(&shm_dict_lock); res = shm_fill_kinfo_locked(fp->f_data, kif, false); sx_sunlock(&shm_dict_lock); return (res); } static int sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) { struct shm_mapping *shmm; struct sbuf sb; struct kinfo_file kif; u_long i; ssize_t curlen; int error, error2; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); curlen = 0; error = 0; sx_slock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { error = shm_fill_kinfo_locked(shmm->sm_shmfd, &kif, true); if (error == EPERM) continue; if (error != 0) break; pack_kinfo(&kif); if (req->oldptr != NULL && kif.kf_structsize + curlen > req->oldlen) break; error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 0 : ENOMEM; if (error != 0) break; curlen += kif.kf_structsize; } } sx_sunlock(&shm_dict_lock); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, NULL, 0, sysctl_posix_shm_list, "", "POSIX SHM list"); Index: projects/fuse2/sys/mips/broadcom/bhnd_nexus.c =================================================================== --- projects/fuse2/sys/mips/broadcom/bhnd_nexus.c (revision 350434) +++ projects/fuse2/sys/mips/broadcom/bhnd_nexus.c (revision 350435) @@ -1,281 +1,282 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * Copyright (c) 2017 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Landon Fuller * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); /* * bhnd(4) driver mix-in providing shared common methods for * bhnd bus devices attached via a MIPS root nexus. */ #include #include #include #include +#include #include #include #include #include #include #include #include #include #include "bcm_machdep.h" #include "bcm_mipsvar.h" #include "bhnd_nexusvar.h" /** * Default bhnd_nexus implementation of BHND_BUS_GET_SERVICE_REGISTRY(). */ static struct bhnd_service_registry * bhnd_nexus_get_service_registry(device_t dev, device_t child) { struct bcm_platform *bp = bcm_get_platform(); return (&bp->services); } /** * Default bhnd_nexus implementation of BHND_BUS_ACTIVATE_RESOURCE(). */ static int bhnd_nexus_activate_resource(device_t dev, device_t child, int type, int rid, struct bhnd_resource *r) { int error; /* Always direct */ if ((error = bus_activate_resource(child, type, rid, r->res))) return (error); r->direct = true; return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_DEACTIVATE_RESOURCE(). */ static int bhnd_nexus_deactivate_resource(device_t dev, device_t child, int type, int rid, struct bhnd_resource *r) { int error; /* Always direct */ KASSERT(r->direct, ("indirect resource delegated to bhnd_nexus\n")); if ((error = bus_deactivate_resource(child, type, rid, r->res))) return (error); r->direct = false; return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_IS_HW_DISABLED(). */ static bool bhnd_nexus_is_hw_disabled(device_t dev, device_t child) { struct bcm_platform *bp; struct bhnd_chipid *cid; bp = bcm_get_platform(); cid = &bp->cid; /* The BCM4706 low-cost package leaves secondary GMAC cores * floating */ if (cid->chip_id == BHND_CHIPID_BCM4706 && cid->chip_pkg == BHND_PKGID_BCM4706L && bhnd_get_device(child) == BHND_COREID_4706_GMAC && bhnd_get_core_unit(child) != 0) { return (true); } return (false); } /** * Default bhnd_nexus implementation of BHND_BUS_AGET_ATTACH_TYPE(). */ static bhnd_attach_type bhnd_nexus_get_attach_type(device_t dev, device_t child) { return (BHND_ATTACH_NATIVE); } /** * Default bhnd_nexus implementation of BHND_BUS_GET_CHIPID(). */ static const struct bhnd_chipid * bhnd_nexus_get_chipid(device_t dev, device_t child) { return (&bcm_get_platform()->cid); } /** * Default bhnd_nexus implementation of BHND_BUS_READ_BOARD_INFO(). */ static int bhnd_nexus_read_board_info(device_t dev, device_t child, struct bhnd_board_info *info) { int error; /* Initialize with NVRAM-derived values */ if ((error = bhnd_bus_generic_read_board_info(dev, child, info))) return (error); /* The board vendor should default to PCI_VENDOR_BROADCOM if not * otherwise specified */ if (info->board_vendor == 0) info->board_vendor = PCI_VENDOR_BROADCOM; return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_MAP_INTR(). */ static int bhnd_nexus_map_intr(device_t dev, device_t child, u_int intr, rman_res_t *irq) { struct bcm_mips_intr_map_data *imd; u_int ivec; uintptr_t xref; int error; /* Fetch the backplane interrupt vector */ if ((error = bhnd_get_intr_ivec(child, intr, &ivec))) { device_printf(dev, "error fetching ivec for intr %u: %d\n", intr, error); return (error); } /* Determine our interrupt domain */ xref = BHND_BUS_GET_INTR_DOMAIN(dev, child, false); KASSERT(xref != 0, ("missing interrupt domain")); /* Allocate our map data */ imd = (struct bcm_mips_intr_map_data *)intr_alloc_map_data( INTR_MAP_DATA_BCM_MIPS, sizeof(*imd), M_WAITOK | M_ZERO); imd->ivec = ivec; /* Map the IRQ */ *irq = intr_map_irq(NULL, xref, &imd->mdata); return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_UNMAP_INTR(). */ static void bhnd_nexus_unmap_intr(device_t dev, device_t child, rman_res_t irq) { if (irq > UINT_MAX) panic("invalid irq: %ju", (uintmax_t)irq); intr_unmap_irq(irq); } /** * Default bhnd_nexus implementation of BHND_BUS_GET_DMA_TRANSLATION(). */ static int bhnd_nexus_get_dma_translation(device_t dev, device_t child, u_int width, uint32_t flags, bus_dma_tag_t *dmat, struct bhnd_dma_translation *translation) { struct bcm_platform *bp = bcm_get_platform(); /* We don't (currently) support any flags */ if (flags != 0x0) return (ENOENT); KASSERT(width > 0 && width <= BHND_DMA_ADDR_64BIT, ("invalid width %u", width)); /* Is the requested width supported? */ if (width > BHND_DMA_ADDR_32BIT) { /* Backplane must support 64-bit addressing */ if (!(bp->cid.chip_caps & BHND_CAP_BP64)) width = BHND_DMA_ADDR_32BIT; } /* No DMA address translation required */ if (dmat != NULL) *dmat = bus_get_dma_tag(dev); if (translation != NULL) { *translation = (struct bhnd_dma_translation) { .base_addr = 0x0, .addr_mask = BHND_DMA_ADDR_BITMASK(width), .addrext_mask = 0 }; } return (0); } static device_method_t bhnd_nexus_methods[] = { /* bhnd interface */ DEVMETHOD(bhnd_bus_get_service_registry,bhnd_nexus_get_service_registry), DEVMETHOD(bhnd_bus_register_provider, bhnd_bus_generic_sr_register_provider), DEVMETHOD(bhnd_bus_deregister_provider, bhnd_bus_generic_sr_deregister_provider), DEVMETHOD(bhnd_bus_retain_provider, bhnd_bus_generic_sr_retain_provider), DEVMETHOD(bhnd_bus_release_provider, bhnd_bus_generic_sr_release_provider), DEVMETHOD(bhnd_bus_activate_resource, bhnd_nexus_activate_resource), DEVMETHOD(bhnd_bus_deactivate_resource, bhnd_nexus_deactivate_resource), DEVMETHOD(bhnd_bus_is_hw_disabled, bhnd_nexus_is_hw_disabled), DEVMETHOD(bhnd_bus_get_attach_type, bhnd_nexus_get_attach_type), DEVMETHOD(bhnd_bus_get_chipid, bhnd_nexus_get_chipid), DEVMETHOD(bhnd_bus_get_dma_translation, bhnd_nexus_get_dma_translation), DEVMETHOD(bhnd_bus_get_intr_domain, bhnd_bus_generic_get_intr_domain), DEVMETHOD(bhnd_bus_map_intr, bhnd_nexus_map_intr), DEVMETHOD(bhnd_bus_read_board_info, bhnd_nexus_read_board_info), DEVMETHOD(bhnd_bus_unmap_intr, bhnd_nexus_unmap_intr), DEVMETHOD_END }; DEFINE_CLASS_0(bhnd, bhnd_nexus_driver, bhnd_nexus_methods, sizeof(struct bhnd_softc)); Index: projects/fuse2/sys/netinet/cc/cc_dctcp.c =================================================================== --- projects/fuse2/sys/netinet/cc/cc_dctcp.c (revision 350434) +++ projects/fuse2/sys/netinet/cc/cc_dctcp.c (revision 350435) @@ -1,469 +1,471 @@ /*- * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2014 Midori Kato * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the DCTCP algorithm for FreeBSD, based on * "Data Center TCP (DCTCP)" by M. Alizadeh, A. Greenberg, D. A. Maltz, * J. Padhye, P. Patel, B. Prabhakar, S. Sengupta, and M. Sridharan., * in ACM Conference on SIGCOMM 2010, New York, USA, * Originally released as the contribution of Microsoft Research project. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#define MAX_ALPHA_VALUE 1024 -VNET_DEFINE_STATIC(uint32_t, dctcp_alpha) = 0; +#define DCTCP_SHIFT 10 +#define MAX_ALPHA_VALUE (1<cc_data; if (CCV(ccv, t_flags) & TF_ECN_PERMIT) { /* * DCTCP doesn't treat receipt of ECN marked packet as a * congestion event. Thus, DCTCP always executes the ACK * processing out of congestion recovery. */ if (IN_CONGRECOVERY(CCV(ccv, t_flags))) { EXIT_CONGRECOVERY(CCV(ccv, t_flags)); newreno_cc_algo.ack_received(ccv, type); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } else newreno_cc_algo.ack_received(ccv, type); if (type == CC_DUPACK) - bytes_acked = CCV(ccv, t_maxseg); + bytes_acked = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); if (type == CC_ACK) bytes_acked = ccv->bytes_this_ack; /* Update total bytes. */ dctcp_data->bytes_total += bytes_acked; /* Update total marked bytes. */ if (dctcp_data->ece_curr) { + //XXRMS: For fluid-model DCTCP, update + //cwnd here during for RTT fairness if (!dctcp_data->ece_prev && bytes_acked > CCV(ccv, t_maxseg)) { dctcp_data->bytes_ecn += (bytes_acked - CCV(ccv, t_maxseg)); } else dctcp_data->bytes_ecn += bytes_acked; dctcp_data->ece_prev = 1; } else { if (dctcp_data->ece_prev && bytes_acked > CCV(ccv, t_maxseg)) dctcp_data->bytes_ecn += CCV(ccv, t_maxseg); dctcp_data->ece_prev = 0; } dctcp_data->ece_curr = 0; /* * Update the fraction of marked bytes at the end of * current window size. */ if ((IN_FASTRECOVERY(CCV(ccv, t_flags)) && SEQ_GEQ(ccv->curack, CCV(ccv, snd_recover))) || (!IN_FASTRECOVERY(CCV(ccv, t_flags)) && SEQ_GT(ccv->curack, dctcp_data->save_sndnxt))) dctcp_update_alpha(ccv); } else newreno_cc_algo.ack_received(ccv, type); } static void dctcp_after_idle(struct cc_var *ccv) { struct dctcp *dctcp_data; - dctcp_data = ccv->cc_data; + if (CCV(ccv, t_flags) & TF_ECN_PERMIT) { + dctcp_data = ccv->cc_data; - /* Initialize internal parameters after idle time */ - dctcp_data->bytes_ecn = 0; - dctcp_data->bytes_total = 0; - dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); - dctcp_data->alpha = V_dctcp_alpha; - dctcp_data->ece_curr = 0; - dctcp_data->ece_prev = 0; - dctcp_data->num_cong_events = 0; + /* Initialize internal parameters after idle time */ + dctcp_data->bytes_ecn = 0; + dctcp_data->bytes_total = 0; + dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); + dctcp_data->alpha = V_dctcp_alpha; + dctcp_data->ece_curr = 0; + dctcp_data->ece_prev = 0; + dctcp_data->num_cong_events = 0; + } - dctcp_cc_algo.after_idle = newreno_cc_algo.after_idle; + newreno_cc_algo.after_idle(ccv); } static void dctcp_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_dctcp); } static int dctcp_cb_init(struct cc_var *ccv) { struct dctcp *dctcp_data; dctcp_data = malloc(sizeof(struct dctcp), M_dctcp, M_NOWAIT|M_ZERO); if (dctcp_data == NULL) return (ENOMEM); /* Initialize some key variables with sensible defaults. */ dctcp_data->bytes_ecn = 0; dctcp_data->bytes_total = 0; /* * When alpha is set to 0 in the beginning, DCTCP sender transfers as * much data as possible until the value converges which may expand the * queueing delay at the switch. When alpha is set to 1, queueing delay * is kept small. * Throughput-sensitive applications should have alpha = 0 * Latency-sensitive applications should have alpha = 1 * * Note: DCTCP draft suggests initial alpha to be 1 but we've decided to * keep it 0 as default. */ dctcp_data->alpha = V_dctcp_alpha; dctcp_data->save_sndnxt = 0; dctcp_data->ce_prev = 0; dctcp_data->ece_curr = 0; dctcp_data->ece_prev = 0; dctcp_data->num_cong_events = 0; ccv->cc_data = dctcp_data; return (0); } /* * Perform any necessary tasks before we enter congestion recovery. */ static void dctcp_cong_signal(struct cc_var *ccv, uint32_t type) { struct dctcp *dctcp_data; - u_int win, mss; + u_int cwin, mss; - dctcp_data = ccv->cc_data; - win = CCV(ccv, snd_cwnd); - mss = CCV(ccv, t_maxseg); + if (CCV(ccv, t_flags) & TF_ECN_PERMIT) { + dctcp_data = ccv->cc_data; + cwin = CCV(ccv, snd_cwnd); + mss = CCV(ccv, t_maxseg); - switch (type) { - case CC_NDUPACK: - if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { + switch (type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { + if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { + CCV(ccv, snd_ssthresh) = + max(cwin / 2, 2 * mss); + dctcp_data->num_cong_events++; + } else { + /* cwnd has already updated as congestion + * recovery. Reverse cwnd value using + * snd_cwnd_prev and recalculate snd_ssthresh + */ + cwin = CCV(ccv, snd_cwnd_prev); + CCV(ccv, snd_ssthresh) = + max(cwin / 2, 2 * mss); + } + ENTER_RECOVERY(CCV(ccv, t_flags)); + } + break; + case CC_ECN: + /* + * Save current snd_cwnd when the host encounters both + * congestion recovery and fast recovery. + */ + CCV(ccv, snd_cwnd_prev) = cwin; if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { - CCV(ccv, snd_ssthresh) = mss * - max(win / 2 / mss, 2); - dctcp_data->num_cong_events++; - } else { - /* cwnd has already updated as congestion - * recovery. Reverse cwnd value using - * snd_cwnd_prev and recalculate snd_ssthresh - */ - win = CCV(ccv, snd_cwnd_prev); - CCV(ccv, snd_ssthresh) = - max(win / 2 / mss, 2) * mss; + if (V_dctcp_slowstart && + dctcp_data->num_cong_events++ == 0) { + CCV(ccv, snd_ssthresh) = + max(cwin / 2, 2 * mss); + dctcp_data->alpha = MAX_ALPHA_VALUE; + dctcp_data->bytes_ecn = 0; + dctcp_data->bytes_total = 0; + dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); + } else + CCV(ccv, snd_ssthresh) = + max((cwin - (((uint64_t)cwin * + dctcp_data->alpha) >> (DCTCP_SHIFT+1))), + 2 * mss); + CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); + ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } - ENTER_RECOVERY(CCV(ccv, t_flags)); - } - break; - case CC_ECN: - /* - * Save current snd_cwnd when the host encounters both - * congestion recovery and fast recovery. - */ - CCV(ccv, snd_cwnd_prev) = win; - if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { - if (V_dctcp_slowstart && - dctcp_data->num_cong_events++ == 0) { - CCV(ccv, snd_ssthresh) = - mss * max(win / 2 / mss, 2); - dctcp_data->alpha = MAX_ALPHA_VALUE; - dctcp_data->bytes_ecn = 0; - dctcp_data->bytes_total = 0; - dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); - } else - CCV(ccv, snd_ssthresh) = max((win - ((win * - dctcp_data->alpha) >> 11)) / mss, 2) * mss; - CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); - ENTER_CONGRECOVERY(CCV(ccv, t_flags)); - } - dctcp_data->ece_curr = 1; - break; - case CC_RTO: - if (CCV(ccv, t_flags) & TF_ECN_PERMIT) { + dctcp_data->ece_curr = 1; + break; + case CC_RTO: CCV(ccv, t_flags) |= TF_ECN_SND_CWR; dctcp_update_alpha(ccv); dctcp_data->save_sndnxt += CCV(ccv, t_maxseg); dctcp_data->num_cong_events++; + break; } - break; - } + } else + newreno_cc_algo.cong_signal(ccv, type); } static void dctcp_conn_init(struct cc_var *ccv) { struct dctcp *dctcp_data; dctcp_data = ccv->cc_data; if (CCV(ccv, t_flags) & TF_ECN_PERMIT) dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); } /* * Perform any necessary tasks before we exit congestion recovery. */ static void dctcp_post_recovery(struct cc_var *ccv) { - dctcp_cc_algo.post_recovery = newreno_cc_algo.post_recovery; + newreno_cc_algo.post_recovery(ccv); if (CCV(ccv, t_flags) & TF_ECN_PERMIT) dctcp_update_alpha(ccv); } /* * Execute an additional ECN processing using ECN field in IP header and the CWR * bit in TCP header. * * delay_ack == 0 - Delayed ACK disabled * delay_ack == 1 - Delayed ACK enabled */ static void dctcp_ecnpkt_handler(struct cc_var *ccv) { struct dctcp *dctcp_data; uint32_t ccflag; int delay_ack; dctcp_data = ccv->cc_data; ccflag = ccv->flags; delay_ack = 1; /* - * DCTCP responses an ACK immediately when the CE state - * in between this segment and the last segment is not same. + * DCTCP responds with an ACK immediately when the CE state + * in between this segment and the last segment has changed. */ if (ccflag & CCF_IPHDR_CE) { if (!dctcp_data->ce_prev && (ccflag & CCF_DELACK)) delay_ack = 0; dctcp_data->ce_prev = 1; CCV(ccv, t_flags) |= TF_ECN_SND_ECE; } else { if (dctcp_data->ce_prev && (ccflag & CCF_DELACK)) delay_ack = 0; dctcp_data->ce_prev = 0; CCV(ccv, t_flags) &= ~TF_ECN_SND_ECE; } /* DCTCP sets delayed ack when this segment sets the CWR flag. */ if ((ccflag & CCF_DELACK) && (ccflag & CCF_TCPHDR_CWR)) delay_ack = 1; if (delay_ack == 0) ccv->flags |= CCF_ACKNOW; - else - ccv->flags &= ~CCF_ACKNOW; } /* * Update the fraction of marked bytes represented as 'alpha'. * Also initialize several internal parameters at the end of this function. */ static void dctcp_update_alpha(struct cc_var *ccv) { struct dctcp *dctcp_data; int alpha_prev; dctcp_data = ccv->cc_data; alpha_prev = dctcp_data->alpha; dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1); /* - * Update alpha: alpha = (1 - g) * alpha + g * F. + * Update alpha: alpha = (1 - g) * alpha + g * M. * Here: * g is weight factor * recommaded to be set to 1/16 * small g = slow convergence between competitive DCTCP flows * large g = impacts low utilization of bandwidth at switches - * F is fraction of marked segments in last RTT + * M is fraction of marked segments in last RTT * updated every RTT * Alpha must be round to 0 - MAX_ALPHA_VALUE. */ - dctcp_data->alpha = min(alpha_prev - (alpha_prev >> V_dctcp_shift_g) + - (dctcp_data->bytes_ecn << (10 - V_dctcp_shift_g)) / + dctcp_data->alpha = ulmin(alpha_prev - (alpha_prev >> V_dctcp_shift_g) + + ((uint64_t)dctcp_data->bytes_ecn << (DCTCP_SHIFT - V_dctcp_shift_g)) / dctcp_data->bytes_total, MAX_ALPHA_VALUE); /* Initialize internal parameters for next alpha calculation */ dctcp_data->bytes_ecn = 0; dctcp_data->bytes_total = 0; dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); } static int dctcp_alpha_handler(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; new = V_dctcp_alpha; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { - if (new > 1) + if (new > MAX_ALPHA_VALUE) error = EINVAL; - else { - if (new > MAX_ALPHA_VALUE) - V_dctcp_alpha = MAX_ALPHA_VALUE; - else - V_dctcp_alpha = new; - } + else + V_dctcp_alpha = new; } return (error); } static int dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; new = V_dctcp_shift_g; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { - if (new > 1) + if (new > DCTCP_SHIFT) error = EINVAL; else V_dctcp_shift_g = new; } return (error); } static int dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; new = V_dctcp_slowstart; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new > 1) error = EINVAL; else V_dctcp_slowstart = new; } return (error); } SYSCTL_DECL(_net_inet_tcp_cc_dctcp); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp, CTLFLAG_RW, NULL, "dctcp congestion control related settings"); SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha, CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_alpha), 0, &dctcp_alpha_handler, - "IU", "dctcp alpha parameter"); + "IU", "dctcp alpha parameter at start of session"); SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g, CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_shift_g), 4, &dctcp_shift_g_handler, "IU", "dctcp shift parameter"); SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, slowstart, CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_slowstart), 0, &dctcp_slowstart_handler, "IU", "half CWND reduction after the first slow start"); DECLARE_CC_MODULE(dctcp, &dctcp_cc_algo); Index: projects/fuse2/sys/netpfil/ipfw/ip_fw2.c =================================================================== --- projects/fuse2/sys/netpfil/ipfw/ip_fw2.c (revision 350434) +++ projects/fuse2/sys/netpfil/ipfw/ip_fw2.c (revision 350435) @@ -1,3529 +1,3536 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include /* for struct grehdr */ #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. */ VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; #ifndef LINEAR_SKIPTO static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) #else static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) #endif /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); /* ipfw_vnet_ready controls when we are open for business */ VNET_DEFINE(int, ipfw_vnet_ready) = 0; VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of concurrently used tables"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU", "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_parse(struct tcphdr *tcp, uint16_t *mss) { u_char *cp = (u_char *)(tcp + 1); int optlen, bits = 0; int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; if (mss != NULL) *mss = be16dec(cp + 2); break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (bits); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { return (flags_match(cmd, tcpopts_parse(tcp, NULL))); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return (0); /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table(chain, cmd->p.kidx, 0, &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct nhop4_basic nh4; if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0) return (0); /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ifp != nh4.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; #endif /* __FreeBSD__ */ } /* * Generate an SCTP packet containing an ABORT chunk. The verification tag * is given by vtag. The T-bit is set in the ABORT chunk if and only if * reflected is not 0. */ static struct mbuf * ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag, int reflected) { struct mbuf *m; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct sctphdr *sctp; struct sctp_chunkhdr *chunk; u_int16_t hlen, plen, tlen; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: hlen = sizeof(struct ip); break; #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); tlen = hlen + plen; m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = tlen; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, tlen); switch (id->addr_type) { case 4: ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(tlen); ip->ip_id = htons(0); ip->ip_off = htons(0); ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_SCTP; ip->ip_sum = 0; ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); sctp = (struct sctphdr *)(ip + 1); break; #ifdef INET6 case 6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(plen); ip6->ip6_nxt = IPPROTO_SCTP; ip6->ip6_hlim = IPV6_DEFHLIM; ip6->ip6_src = id->dst_ip6; ip6->ip6_dst = id->src_ip6; sctp = (struct sctphdr *)(ip6 + 1); break; #endif } sctp->src_port = htons(id->dst_port); sctp->dest_port = htons(id->src_port); sctp->v_tag = htonl(vtag); sctp->checksum = htonl(0); chunk = (struct sctp_chunkhdr *)(sctp + 1); chunk->chunk_type = SCTP_ABORT_ASSOCIATION; chunk->chunk_flags = 0; if (reflected != 0) { chunk->chunk_flags |= SCTP_HAD_NO_TCB; } chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr)); sctp->checksum = sctp_calculate_cksum(m, hlen); return (m); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. */ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m = NULL; /* stupid compiler */ struct ip *h = NULL; /* stupid compiler */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif struct tcphdr *th = NULL; int len, dir; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: len = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case 6: len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (id->addr_type) { case 4: h = mtod(m, struct ip *); /* prepare for checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(sizeof(struct tcphdr)); if (dir) { h->ip_src.s_addr = htonl(id->src_ip); h->ip_dst.s_addr = htonl(id->dst_ip); } else { h->ip_src.s_addr = htonl(id->dst_ip); h->ip_dst.s_addr = htonl(id->src_ip); } th = (struct tcphdr *)(h + 1); break; #ifdef INET6 case 6: h6 = mtod(m, struct ip6_hdr *); /* prepare for checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(sizeof(struct tcphdr)); if (dir) { h6->ip6_src = id->src_ip6; h6->ip6_dst = id->dst_ip6; } else { h6->ip6_src = id->dst_ip6; h6->ip6_dst = id->src_ip6; } th = (struct tcphdr *)(h6 + 1); break; #endif } if (dir) { th->th_sport = htons(id->src_port); th->th_dport = htons(id->dst_port); } else { th->th_sport = htons(id->dst_port); th->th_dport = htons(id->src_port); } th->th_off = sizeof(struct tcphdr) >> 2; if (flags & TH_RST) { if (flags & TH_ACK) { th->th_seq = htonl(ack); th->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; th->th_ack = htonl(seq); th->th_flags = TH_RST | TH_ACK; } } else { /* * Keepalive - use caller provided sequence numbers */ th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_flags = TH_ACK; } switch (id->addr_type) { case 4: th->th_sum = in_cksum(m, len); /* finish the ip header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(0); h->ip_len = htons(len); h->ip_ttl = V_ip_defttl; h->ip_sum = 0; break; #ifdef INET6 case 6: th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), sizeof(struct tcphdr)); /* finish the ip6 header */ h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif } return (m); } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static const struct in6_addr lla_mask = {{{ 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}}; static int ipfw_localip6(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_MULTICAST(in6)) return (0); if (!IN6_IS_ADDR_LINKLOCAL(in6)) return (in6_localip(in6)); IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, in6, &lla_mask)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct nhop6_basic nh6; if (IN6_IS_SCOPE_LINKLOCAL(src)) return (1); if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0) return (0); /* If ifp is provided, check for equality with route table. */ if (ifp != NULL && ifp != nh6.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static int map_icmp_unreach(int code) { /* RFC 7915 p4.2 */ switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: return (ICMP6_DST_UNREACH_NOROUTE); case ICMP_UNREACH_PORT: return (ICMP6_DST_UNREACH_NOPORT); default: /* * Map the rest of codes into admit prohibited. * XXX: unreach proto should be mapped into ICMPv6 * parameter problem, but we use only unreach type. */ return (ICMP6_DST_UNREACH_ADMIN); } } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code == ICMP6_UNREACH_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m0; struct sctphdr *sctp; u_int32_t v_tag; int reflected; sctp = (struct sctphdr *)((char *)ip6 + hlen); reflected = 1; v_tag = ntohl(sctp->v_tag); /* Investigate the first chunk header if available */ if (m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { struct sctp_chunkhdr *chunk; chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (m->m_pkthdr.len > hlen + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { struct sctp_init *init; init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but don't do to avoid attacks. */ v_tag = 0; break; } } if (v_tag == 0) { m0 = NULL; } else { m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else if (code == ICMP_REJECT_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m; struct sctphdr *sctp; struct sctp_chunkhdr *chunk; struct sctp_init *init; u_int32_t v_tag; int reflected; sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *)); reflected = 1; v_tag = ntohl(sctp->v_tag); if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { /* Look at the first chunk header if available */ chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (iplen > (ip->ip_hl << 2) + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but don't do to avoid attacks. */ v_tag = 0; break; } } if (v_tag == 0) { m = NULL; } else { m = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #if defined(USERSPACE) return 0; // not supported in userspace #else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; int lookupflags; int match; id = &args->f_id; inp = args->inp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else if (id->proto == IPPROTO_UDPLITE) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_ulitecbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 if (args->flags & IPFW_ARGS_IN) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), lookupflags, args->ifp, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); if (args->flags & IPFW_ARGS_IN) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), lookupflags, args->ifp, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ #endif /* not supported in userspace */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; args->flags |= IPFW_ARGS_REF; } #ifndef LINEAR_SKIPTO /* * Helper function to enable cached rule lookups using * cached_id and cached_pos fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; /* If possible use cached f_pos (in f->cached_pos), * whose version is written in f->cached_id * (horrible hacks to avoid changing the ABI). */ if (num != IP_FW_TARG && f->cached_id == chain->id) f_pos = f->cached_pos; else { int i = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; if (chain->idxmap != NULL) f_pos = chain->idxmap[i]; else f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (num != IP_FW_TARG) { f->cached_id = chain->id; f->cached_pos = f_pos; } } return (f_pos); } #else /* * Helper function to enable real fast rule lookups. */ static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; num = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && num <= f->rulenum) num = f->rulenum + 1; f_pos = chain->idxmap[num]; return (f_pos); } #endif #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->ifp Incoming or outgoing interface. * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * OR * args->mem Pointer to contigous memory chunk. * ip Is the beginning of the ip(4 or 6) header. * eh Ethernet header in case if input is Layer2. */ struct mbuf *m; struct ip *ip; struct ether_header *eh; /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; int f_pos = 0; /* index of current rule in the array */ int retval = 0; struct ifnet *oif, *iif; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragment (fragment header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port, dst_port; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ int iplen = 0; int pktlen; struct ipfw_dyn_info dyn_info; struct ip_fw *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ IPFW_RLOCK_TRACKER; bool mem; if ((mem = (args->flags & IPFW_ARGS_LENMASK))) { if (args->flags & IPFW_ARGS_ETHER) { eh = (struct ether_header *)args->mem; if (eh->ether_type == htons(ETHERTYPE_VLAN)) ip = (struct ip *) ((struct ether_vlan_header *)eh + 1); else ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = (struct ip *)args->mem; } pktlen = IPFW_ARGS_LENGTH(args->flags); args->f_id.fib = args->ifp->if_fib; /* best guess */ } else { m = args->m; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ if (args->flags & IPFW_ARGS_ETHER) { /* We need some amount of data to be contiguous. */ if (m->m_len < min(m->m_pkthdr.len, max_protohdr) && (args->m = m = m_pullup(m, min(m->m_pkthdr.len, max_protohdr))) == NULL) goto pullup_failed; eh = mtod(m, struct ether_header *); ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = mtod(m, struct ip *); } pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* mbuf not altered */ } dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ src_port = dst_port = 0; DYN_INFO_INIT(&dyn_info); /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ -#define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) +#define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define EHLEN (eh != NULL ? ((char *)ip - (char *)eh) : 0) -#define PULLUP_LEN(_len, p, T) \ +#define _PULLUP_LOCKED(_len, p, T, unlock) \ do { \ int x = (_len) + T + EHLEN; \ if (mem) { \ MPASS(pktlen >= x); \ p = (char *)args->mem + (_len) + EHLEN; \ } else { \ if (__predict_false((m)->m_len < x)) { \ args->m = m = m_pullup(m, x); \ - if (m == NULL) \ + if (m == NULL) { \ + unlock; \ goto pullup_failed; \ + } \ } \ p = mtod(m, char *) + (_len) + EHLEN; \ } \ } while (0) + +#define PULLUP_LEN(_len, p, T) _PULLUP_LOCKED(_len, p, T, ) +#define PULLUP_LEN_LOCKED(_len, p, T) \ + _PULLUP_LOCKED(_len, p, T, IPFW_PF_RUNLOCK(chain)) /* * In case pointers got stale after pullups, update them. */ #define UPDATE_POINTERS() \ do { \ if (!mem) { \ if (eh != NULL) { \ eh = mtod(m, struct ether_header *); \ ip = (struct ip *)(eh + 1); \ } else \ ip = mtod(m, struct ip *); \ args->m = m; \ } \ } while (0) /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IPV6)) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->flags |= IPFW_ARGS_IP6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_GRE: /* RFC 1701 */ /* XXX GRE header check? */ PULLUP_TO(hlen, ulp, struct grehdr); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, offsetof( struct carp_header, carp_counter)); if (CARP_ADVERTISEMENT != ((struct carp_header *)ulp)->carp_type) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } UPDATE_POINTERS(); ip6 = (struct ip6_hdr *)ip; args->f_id.addr_type = 6; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6); } else if (pktlen >= sizeof(struct ip) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IP)) && ip->ip_v == 4) { is_ipv4 = 1; args->flags |= IPFW_ARGS_IP4; hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster * matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } else { if (offset == 1 && proto == IPPROTO_TCP) { /* RFC 3128 */ goto pullup_failed; } } UPDATE_POINTERS(); args->f_id.addr_type = 4; args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } else { proto = 0; dst_ip.s_addr = src_ip.s_addr = 0; args->f_id.addr_type = 1; /* XXX */ } #undef PULLUP_TO pktlen = iplen < pktlen ? iplen: pktlen; /* Properly initialize the rest of f_id */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->flags & IPFW_ARGS_REF) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) * Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } if (args->flags & IPFW_ARGS_IN) { iif = args->ifp; oif = NULL; } else { MPASS(args->flags & IPFW_ARGS_OUT); iif = mem ? NULL : m_rcvif(m); oif = args->ifp; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(iif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(args->ifp, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->flags & IPFW_ARGS_ETHER) { u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->flags & IPFW_ARGS_ETHER) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (ntohs(eh->ether_type) >= p[0] && ntohs(eh->ether_type) <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->flags & IPFW_ARGS_ETHER); break; case O_DIVERTED: if ((args->flags & IPFW_ARGS_REF) == 0) break; /* * For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ match = ((args->rule.info & IPFW_IS_MASK) == IPFW_IS_DIVERT) && ( ((args->rule.info & IPFW_INFO_IN) ? 1: 2) & cmd->arg1); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_DST_LOOKUP: { void *pkey; uint32_t vidx, key; uint16_t keylen; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* Determine lookup key type */ vidx = ((ipfw_insn_u32 *)cmd)->d[1]; if (vidx != 4 /* uid */ && vidx != 5 /* jail */ && is_ipv6 == 0 && is_ipv4 == 0) break; /* Determine key length */ if (vidx == 0 /* dst-ip */ || vidx == 1 /* src-ip */) keylen = is_ipv6 ? sizeof(struct in6_addr): sizeof(in_addr_t); else { keylen = sizeof(key); pkey = &key; } if (vidx == 0 /* dst-ip */) pkey = is_ipv4 ? (void *)&dst_ip: (void *)&args->f_id.dst_ip6; else if (vidx == 1 /* src-ip */) pkey = is_ipv4 ? (void *)&src_ip: (void *)&args->f_id.src_ip6; else if (vidx == 6 /* dscp */) { if (is_ipv4) key = ip->ip_tos >> 2; else { key = args->f_id.flow_id6; key = (key & 0x0f) << 2 | (key & 0xf000) >> 14; } key &= 0x3f; } else if (vidx == 2 /* dst-port */ || vidx == 3 /* src-port */) { /* Skip fragments */ if (offset != 0) break; /* Skip proto without ports */ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE && proto != IPPROTO_SCTP) break; if (vidx == 2 /* dst-port */) key = dst_port; else key = src_port; } #ifndef USERSPACE else if (vidx == 4 /* uid */ || vidx == 5 /* jail */) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache->cr_uid; else if (vidx == 5 /* jail */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache.uid; else if (vidx == 5 /* jail */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ } #endif /* !USERSPACE */ else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; tablearg = vidx; break; } /* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */ /* FALLTHROUGH */ } case O_IP_SRC_LOOKUP: { void *pkey; uint32_t vidx; uint16_t keylen; if (is_ipv4) { keylen = sizeof(in_addr_t); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &dst_ip; else pkey = &src_ip; } else if (is_ipv6) { keylen = sizeof(struct in6_addr); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &args->f_id.dst_ip6; else pkey = &args->f_id.src_ip6; } else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) { match = ((ipfw_insn_u32 *)cmd)->d[0] == TARG_VAL(chain, vidx, tag); if (!match) break; } tablearg = vidx; break; } case O_IP_FLOW_LOOKUP: { uint32_t v = 0; match = ipfw_lookup_table(chain, cmd->arg1, 0, &args->f_id, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == TARG_VAL(chain, v, tag); if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { match = in_localip(src_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { match = in_localip(dst_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE || proto == IPPROTO_TCP || proto == IPPROTO_SCTP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPTTL: if (!is_ipv4) break; case O_IPLEN: { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; x = (*v & 0x0F) << 2; v++; x |= *v >> 6; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x >= 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; #ifdef INET6 if (is_ipv6) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip; if (ip6->ip6_plen == 0) { /* * Jumbo payload is not * supported by this * opcode. */ break; } x = iplen - hlen; } else #endif /* INET6 */ x = iplen - (ip->ip_hl << 2); tcp = TCP(ulp); x -= tcp->th_off << 2; if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: if (proto == IPPROTO_TCP && offset == 0 && ulp){ - PULLUP_LEN(hlen, ulp, + PULLUP_LEN_LOCKED(hlen, ulp, (TCP(ulp)->th_off << 2)); match = tcpopts_match(TCP(ulp), cmd); } break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPMSS: if (proto == IPPROTO_TCP && (args->f_id._flags & TH_SYN) != 0 && ulp != NULL) { uint16_t mss, *p; int i; - PULLUP_LEN(hlen, ulp, + PULLUP_LEN_LOCKED(hlen, ulp, (TCP(ulp)->th_off << 2)); if ((tcpopts_parse(TCP(ulp), &mss) & IP_FW_TCPOPT_MSS) == 0) break; if (cmdlen == 1) { match = (cmd->arg1 == mss); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (mss >= p[0] && mss <= p[1]); } break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(chain, f, hlen, args, offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = (args->flags & IPFW_ARGS_OUT || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib)))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib); else match = 1; break; case O_IPSEC: match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else if (proto == IPPROTO_UDPLITE) pi = &V_ulitecbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* For incoming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule (one * exception is O_SKIP_ACTION which could be * between these opcodes and 'action' one). * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. * * O_SKIP_ACTION: this opcode is not a real 'action' * either, and is stored right before the 'action' * part of the rule, right after the O_KEEP_STATE * opcode. It causes match failure so the real * 'action' could be executed only if the rule * is checked via dynamic rule from the state * table, as in such case execution starts * from the true 'action' opcode directly. * */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_dyn_install_state(chain, f, (ipfw_insn_limit *)cmd, args, ulp, pktlen, &dyn_info, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_info. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) && (q = ipfw_dyn_lookup_state(args, ulp, pktlen, cmd, &dyn_info)) != NULL) { /* * Found dynamic entry, jump to the * 'action' part of the parent rule * by setting f, cmd, l and clearing * cmdlen. */ f = q; f_pos = dyn_info.f_pos; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_SKIP_ACTION: match = 0; /* skip to the next rule */ l = 0; /* exit inner loop */ break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->flags & IPFW_ARGS_ETHER) break; /* not on layer 2 */ /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } IPFW_INC_RULE_COUNTER(f, pktlen); stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; f_pos = JUMP(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST( &args->f_id.dst_ip6)) { send_reject6(args, cmd->opcode == O_REJECT ? map_icmp_unreach(cmd->arg1): cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { #ifdef INET6 /* * We use O_FORWARD_IP opcode for * fwd rule with tablearg, but tables * now support IPv6 addresses. And * when we are inspecting IPv6 packet, * we can use nh6 field from * table_value as next_hop6 address. */ if (is_ipv6) { struct ip_fw_nh6 *nh6; args->flags |= IPFW_ARGS_NH6; nh6 = &args->hopstore6; nh6->sin6_addr = TARG_VAL( chain, tablearg, nh6); nh6->sin6_port = sa->sin_port; nh6->sin6_scope_id = TARG_VAL( chain, tablearg, zoneid); } else #endif { args->flags |= IPFW_ARGS_NH4; args->hopstore.sin_port = sa->sin_port; sa = &args->hopstore; sa->sin_family = AF_INET; sa->sin_len = sizeof(*sa); sa->sin_addr.s_addr = htonl( TARG_VAL(chain, tablearg, nh4)); } } else { args->flags |= IPFW_ARGS_NH4PTR; args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->flags |= IPFW_ARGS_NH6PTR; args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = TARG(cmd->arg1, fib) & 0x7FFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; /* XXX */ l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t old; old = *(uint16_t *)ip; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); } else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; *v = (*v & 0xF0) | (code >> 2); v++; *v = (*v & 0x3F) | ((code & 0x03) << 6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ /* * Ensure that we do not invoke NAT handler for * non IPv4 packets. Libalias expects only IPv4. */ if (!is_ipv4 || !IPFW_NAT_LOADED) { retval = IP_FW_DENY; break; } struct cfg_nat *t; int nat_id; args->rule.info = 0; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == IP_FW_NAT44_GLOBAL) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); break; case O_REASS: { int ip_off; l = 0; /* in any case exit inner loop */ if (is_ipv6) /* IPv6 is not supported yet */ break; IPFW_INC_RULE_COUNTER(f, pktlen); ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; args->rule.info = 0; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } case O_EXTERNAL_ACTION: l = 0; /* in any case exit inner loop */ retval = ipfw_run_eaction(chain, args, cmd, &done); /* * If both @retval and @done are zero, * consider this as rule matching and * update counters. */ if (retval == 0 && done == 0) { IPFW_INC_RULE_COUNTER(f, pktlen); /* * Reset the result of the last * dynamic state lookup. * External action can change * @args content, and it may be * used for new state lookup later. */ DYN_INFO_INIT(&dyn_info); } break; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN +#undef PULLUP_LEN_LOCKED if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. */ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } /* * Switches table namespace between global and per-set. */ static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) { int error; unsigned int sets; sets = V_fw_tables_sets; error = sysctl_handle_int(oidp, &sets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_init_sopt_handler(); ipfw_init_obj_rewriter(); ipfw_iface_init(); return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_iface_destroy(); ipfw_destroy_sopt_handler(); ipfw_destroy_obj_rewriter(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error, first; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; first = IS_DEFAULT_VNET(curvnet) ? 1 : 0; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* Init shared services hash table */ ipfw_init_srv(chain); ipfw_init_counters(); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain, first); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } IPFW_LOCK_INIT(chain); /* fill and insert the default rule */ rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); rule->flags |= IPFW_RULE_NOOPT; rule->cmd_len = 1; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->default_rule = rule; ipfw_add_protected_rule(chain, rule, 0); ipfw_dyn_init(chain); ipfw_eaction_init(chain, first); #ifdef LINEAR_SKIPTO ipfw_init_skipto_cache(chain); #endif ipfw_bpf_init(first); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. * Even if the latter two fail we still keep the module alive * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl3; error = ipfw_attach_hooks(); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap; struct ip_fw_chain *chain = &V_layer3_chain; int i, last; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ ipfw_detach_hooks(); V_ip_fw_ctl_ptr = NULL; last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_UH_WLOCK(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) ipfw_reap_add(chain, &reap, chain->map[i]); free(chain->map, M_IPFW); #ifdef LINEAR_SKIPTO ipfw_destroy_skipto_cache(chain); #endif IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_destroy_tables(chain, last); ipfw_eaction_uninit(chain, last); if (reap != NULL) ipfw_reap_rules(reap); vnet_ipfw_iface_destroy(chain); ipfw_destroy_srv(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ ipfw_destroy_counters(); ipfw_bpf_uninit(last); return (0); } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_eaction.c =================================================================== --- projects/fuse2/sys/netpfil/ipfw/ip_fw_eaction.c (revision 350434) +++ projects/fuse2/sys/netpfil/ipfw/ip_fw_eaction.c (revision 350435) @@ -1,457 +1,455 @@ /*- * Copyright (c) 2016-2017 Yandex LLC * Copyright (c) 2016-2017 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include #include /* struct ipfw_rule_ref */ #include #include #include "opt_ipfw.h" /* * External actions support for ipfw. * * This code provides KPI for implementing loadable modules, that * can provide handlers for external action opcodes in the ipfw's * rules. * Module should implement opcode handler with type ipfw_eaction_t. * This handler will be called by ipfw_chk() function when * O_EXTERNAL_ACTION opcode is matched. The handler must return * value used as return value in ipfw_chk(), i.e. IP_FW_PASS, * IP_FW_DENY (see ip_fw_private.h). * Also the last argument must be set by handler. If it is zero, * the search continues to the next rule. If it has non zero value, * the search terminates. * * The module that implements external action should register its * handler and name with ipfw_add_eaction() function. * This function will return eaction_id, that can be used by module. * * It is possible to pass some additional information to external * action handler using O_EXTERNAL_INSTANCE and O_EXTERNAL_DATA opcodes. * Such opcodes should be next after the O_EXTERNAL_ACTION opcode. * For the O_EXTERNAL_INSTANCE opcode the cmd->arg1 contains index of named * object related to an instance of external action. * For the O_EXTERNAL_DATA opcode the cmd contains the data that can be used * by external action handler without needing to create named instance. * * In case when eaction module uses named instances, it should register * opcode rewriting routines for O_EXTERNAL_INSTANCE opcode. The * classifier callback can look back into O_EXTERNAL_ACTION opcode (it * must be in the (ipfw_insn *)(cmd - 1)). By arg1 from O_EXTERNAL_ACTION * it can deteremine eaction_id and compare it with its own. * The macro IPFW_TLV_EACTION_NAME(eaction_id) can be used to deteremine * the type of named_object related to external action instance. * * On module unload handler should be deregistered with ipfw_del_eaction() * function using known eaction_id. */ struct eaction_obj { struct named_object no; ipfw_eaction_t *handler; char name[64]; }; #define EACTION_OBJ(ch, cmd) \ ((struct eaction_obj *)SRV_OBJECT((ch), (cmd)->arg1)) #if 0 #define EACTION_DEBUG(fmt, ...) do { \ printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ } while (0) #else #define EACTION_DEBUG(fmt, ...) #endif const char *default_eaction_typename = "drop"; static int default_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { *done = 1; /* terminate the search */ return (IP_FW_DENY); } /* * Opcode rewriting callbacks. */ static int eaction_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) { EACTION_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); *puidx = cmd->arg1; *ptype = 0; return (0); } static void eaction_update(ipfw_insn *cmd, uint16_t idx) { cmd->arg1 = idx; EACTION_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1); } static int eaction_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno) { ipfw_obj_ntlv *ntlv; if (ti->tlvs == NULL) return (EINVAL); /* Search ntlv in the buffer provided by user */ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_EACTION); if (ntlv == NULL) return (EINVAL); EACTION_DEBUG("name %s, uidx %u, type %u", ntlv->name, ti->uidx, ti->type); /* * Search named object with corresponding name. * Since eaction objects are global - ignore the set value * and use zero instead. */ *pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, IPFW_TLV_EACTION, ntlv->name); if (*pno == NULL) return (ESRCH); return (0); } static struct named_object * eaction_findbykidx(struct ip_fw_chain *ch, uint16_t idx) { EACTION_DEBUG("kidx %u", idx); return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx)); } static struct opcode_obj_rewrite eaction_opcodes[] = { { .opcode = O_EXTERNAL_ACTION, .etlv = IPFW_TLV_EACTION, .classifier = eaction_classify, .update = eaction_update, .find_byname = eaction_findbyname, .find_bykidx = eaction_findbykidx, }, }; static int create_eaction_obj(struct ip_fw_chain *ch, ipfw_eaction_t handler, const char *name, uint16_t *eaction_id) { struct namedobj_instance *ni; struct eaction_obj *obj; IPFW_UH_UNLOCK_ASSERT(ch); ni = CHAIN_TO_SRV(ch); obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO); obj->no.name = obj->name; obj->no.etlv = IPFW_TLV_EACTION; obj->handler = handler; strlcpy(obj->name, name, sizeof(obj->name)); IPFW_UH_WLOCK(ch); if (ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION, name) != NULL) { /* * Object is already created. * We don't allow eactions with the same name. */ IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); EACTION_DEBUG("External action with typename " "'%s' already exists", name); return (EEXIST); } if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) { IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); EACTION_DEBUG("alloc_idx failed"); return (ENOSPC); } ipfw_objhash_add(ni, &obj->no); IPFW_WLOCK(ch); SRV_OBJECT(ch, obj->no.kidx) = obj; IPFW_WUNLOCK(ch); obj->no.refcnt++; IPFW_UH_WUNLOCK(ch); if (eaction_id != NULL) *eaction_id = obj->no.kidx; return (0); } static void destroy_eaction_obj(struct ip_fw_chain *ch, struct named_object *no) { struct namedobj_instance *ni; struct eaction_obj *obj; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_SRV(ch); IPFW_WLOCK(ch); obj = SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = NULL; IPFW_WUNLOCK(ch); ipfw_objhash_del(ni, no); ipfw_objhash_free_idx(ni, no->kidx); free(obj, M_IPFW); } /* * Resets all eaction opcodes to default handlers. */ static void reset_eaction_rules(struct ip_fw_chain *ch, uint16_t eaction_id, uint16_t instance_id, bool reset_rules) { struct named_object *no; int i; IPFW_UH_WLOCK_ASSERT(ch); no = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, IPFW_TLV_EACTION, default_eaction_typename); if (no == NULL) panic("Default external action handler is not found"); if (eaction_id == no->kidx) panic("Wrong eaction_id"); EACTION_DEBUG("Going to replace id %u with %u", eaction_id, no->kidx); IPFW_WLOCK(ch); /* * Reset eaction objects only if it is referenced by rules. * But always reset objects for orphaned dynamic states. */ if (reset_rules) { for (i = 0; i < ch->n_rules; i++) { /* * Refcount on the original object will be just * ignored on destroy. Refcount on default_eaction * will be decremented on rule deletion, thus we * need to reference default_eaction object. */ if (ipfw_reset_eaction(ch, ch->map[i], eaction_id, no->kidx, instance_id) != 0) no->refcnt++; } } /* * Reset eaction opcodes for orphaned dynamic states. * Since parent rules are already deleted, we don't need to * reference named object of default_eaction. */ ipfw_dyn_reset_eaction(ch, eaction_id, no->kidx, instance_id); IPFW_WUNLOCK(ch); } /* * Initialize external actions framework. * Create object with default eaction handler "drop". */ int ipfw_eaction_init(struct ip_fw_chain *ch, int first) { int error; error = create_eaction_obj(ch, default_eaction, default_eaction_typename, NULL); if (error != 0) return (error); IPFW_ADD_OBJ_REWRITER(first, eaction_opcodes); EACTION_DEBUG("External actions support initialized"); return (0); } void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last) { struct namedobj_instance *ni; struct named_object *no; ni = CHAIN_TO_SRV(ch); IPFW_UH_WLOCK(ch); no = ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION, default_eaction_typename); if (no != NULL) destroy_eaction_obj(ch, no); IPFW_UH_WUNLOCK(ch); IPFW_DEL_OBJ_REWRITER(last, eaction_opcodes); EACTION_DEBUG("External actions support uninitialized"); } /* * Registers external action handler to the global array. * On success it returns eaction id, otherwise - zero. */ uint16_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, const char *name) { uint16_t eaction_id; eaction_id = 0; if (ipfw_check_object_name_generic(name) == 0) { create_eaction_obj(ch, handler, name, &eaction_id); EACTION_DEBUG("Registered external action '%s' with id %u", name, eaction_id); } return (eaction_id); } /* * Deregisters external action handler with id eaction_id. */ int ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id) { struct named_object *no; IPFW_UH_WLOCK(ch); no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id); if (no == NULL || no->etlv != IPFW_TLV_EACTION) { IPFW_UH_WUNLOCK(ch); return (EINVAL); } reset_eaction_rules(ch, eaction_id, 0, (no->refcnt > 1)); EACTION_DEBUG("External action '%s' with id %u unregistered", no->name, eaction_id); destroy_eaction_obj(ch, no); IPFW_UH_WUNLOCK(ch); return (0); } int ipfw_reset_eaction(struct ip_fw_chain *ch, struct ip_fw *rule, uint16_t eaction_id, uint16_t default_id, uint16_t instance_id) { ipfw_insn *cmd, *icmd; - int l, cmdlen; + int l; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK_ASSERT(ch); - cmd = ACTION_PTR(rule); - l = rule->cmd_len - rule->act_ofs; - while (l > 0) { - cmdlen = F_LEN(cmd); - l -= cmdlen; - if (cmd->opcode == O_EXTERNAL_ACTION || l <= 0) - break; - cmd += cmdlen; - } /* * Return if there is not O_EXTERNAL_ACTION or its id is * different. */ + cmd = ipfw_get_action(rule); if (cmd->opcode != O_EXTERNAL_ACTION || cmd->arg1 != eaction_id) return (0); /* * If instance_id is specified, we need to truncate the * rule length. Check if there is O_EXTERNAL_INSTANCE opcode. + * + * NOTE: F_LEN(cmd) must be 1 for O_EXTERNAL_ACTION opcode, + * and rule length should be enough to keep O_EXTERNAL_INSTANCE + * opcode, thus we do check for l > 1. */ - if (instance_id != 0 && l > 0) { - MPASS(cmdlen == 1); + l = rule->cmd + rule->cmd_len - cmd; + if (instance_id != 0 && l > 1) { + MPASS(F_LEN(cmd) == 1); icmd = cmd + 1; if (icmd->opcode != O_EXTERNAL_INSTANCE || icmd->arg1 != instance_id) return (0); /* * Since named_object related to this instance will be * destroyed, truncate the chain of opcodes to remove * the rest of cmd chain just after O_EXTERNAL_ACTION * opcode. */ EACTION_DEBUG("truncate rule %d: len %u -> %u", - rule->rulenum, rule->cmd_len, rule->cmd_len - l); - rule->cmd_len -= l; + rule->rulenum, rule->cmd_len, + rule->cmd_len - F_LEN(icmd)); + rule->cmd_len -= F_LEN(icmd); MPASS(((uint32_t *)icmd - (uint32_t *)rule->cmd) == rule->cmd_len); } cmd->arg1 = default_id; /* Set to default id */ /* * Return 1 when reset successfully happened. */ return (1); } /* * This function should be called before external action instance is * destroyed. It will reset eaction_id to default_id for rules, where * eaction has instance with id == kidx. */ int ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint16_t eaction_id, uint16_t kidx) { struct named_object *no; IPFW_UH_WLOCK_ASSERT(ch); no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id); if (no == NULL || no->etlv != IPFW_TLV_EACTION) return (EINVAL); reset_eaction_rules(ch, eaction_id, kidx, 0); return (0); } int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { return (EACTION_OBJ(ch, cmd)->handler(ch, args, cmd, done)); } Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_nat.c =================================================================== --- projects/fuse2/sys/netpfil/ipfw/ip_fw_nat.c (revision 350434) +++ projects/fuse2/sys/netpfil/ipfw/ip_fw_nat.c (revision 350435) @@ -1,1243 +1,1242 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Paolo Pisati * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for in_cksum */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; uint16_t port; }; /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ uint16_t mode; /* type of redirect mode */ uint16_t proto; /* protocol: tcp/udp */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ uint16_t lport; /* local port */ uint16_t pport; /* public port */ uint16_t rport; /* remote port */ uint16_t pport_cnt; /* number of public ports */ uint16_t rport_cnt; /* number of remote ports */ struct alias_link **alink; u_int16_t spool_cnt; /* num of entry in spool chain */ /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; /* Nat configuration data struct. */ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ struct libalias *lib; /* libalias instance */ int mode; /* aliasing mode */ int redir_cnt; /* number of entry in spool chain */ /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; char if_name[IF_NAMESIZE]; /* interface name */ }; static eventhandler_tag ifaddr_event_tag; static void ifaddr_change(void *arg __unused, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; struct ip_fw_chain *chain; KASSERT(curvnet == ifp->if_vnet, ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet)); if (V_ipfw_vnet_ready == 0 || V_ipfw_nat_ready == 0) return; chain = &V_layer3_chain; IPFW_UH_WLOCK(chain); /* Check every nat entry... */ LIST_FOREACH(ptr, &chain->nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) continue; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; IPFW_WLOCK(chain); ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); IPFW_WUNLOCK(chain); } if_addr_runlock(ifp); } IPFW_UH_WUNLOCK(chain); } /* * delete the pointers for nat entry ix, or all of them if ix < 0 */ static void flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) { - int i; ipfw_insn_nat *cmd; + int i; IPFW_WLOCK_ASSERT(chain); for (i = 0; i < chain->n_rules; i++) { - cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); - /* XXX skip log and the like ? */ + cmd = (ipfw_insn_nat *)ipfw_get_action(chain->map[i]); if (cmd->o.opcode == O_NAT && cmd->nat != NULL && (ix < 0 || cmd->nat->id == ix)) cmd->nat = NULL; } } static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { struct cfg_redir *r, *tmp_r; struct cfg_spool *s, *tmp_s; int i, num; LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. */ switch (r->mode) { case NAT44_REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ case NAT44_REDIR_ADDR: case NAT44_REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); /* Del spool cfg if any. */ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { LIST_REMOVE(s, _next); free(s, M_IPFW); } free(r->alink, M_IPFW); LIST_REMOVE(r, _next); free(r, M_IPFW); break; default: printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? */ break; } } } static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; int cnt, off, i; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct nat44_cfg_redir *)&buf[off]; r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO); r->mode = ser_r->mode; r->laddr = ser_r->laddr; r->paddr = ser_r->paddr; r->raddr = ser_r->raddr; r->lport = ser_r->lport; r->pport = ser_r->pport; r->rport = ser_r->rport; r->pport_cnt = ser_r->pport_cnt; r->rport_cnt = ser_r->rport_cnt; r->proto = ser_r->proto; r->spool_cnt = ser_r->spool_cnt; //memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); off += sizeof(struct nat44_cfg_redir); r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { case NAT44_REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; case NAT44_REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. */ u_short remotePortCopy = r->rport + i; if (r->rport_cnt == 1 && r->rport == 0) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; break; } } break; case NAT44_REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; default: printf("unknown redirect mode: %u\n", r->mode); break; } if (r->alink[0] == NULL) { printf("LibAliasRedirect* returned NULL\n"); free(r->alink, M_IPFW); free(r, M_IPFW); return (EINVAL); } /* LSNAT handling. */ for (i = 0; i < r->spool_cnt; i++) { ser_s = (struct nat44_cfg_spool *)&buf[off]; s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO); s->addr = ser_s->addr; s->port = ser_s->port; LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); off += sizeof(struct nat44_cfg_spool); /* Hook spool entry. */ LIST_INSERT_HEAD(&r->spool_chain, s, _next); } /* And finally hook this redir entry. */ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } return (0); } static void free_nat_instance(struct cfg_nat *ptr) { del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } /* * ipfw_nat - perform mbuf header translation. * * Note V_layer3_chain has to be locked while calling ipfw_nat() in * 'global' operation mode (t == NULL). * */ static int ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) { struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ int ldt, retval, found; struct ip_fw_chain *chain; char *c; ldt = 0; retval = 0; mcl = m_megapullup(m, m->m_pkthdr.len); if (mcl == NULL) { args->m = NULL; return (IP_FW_DENY); } ip = mtod(mcl, struct ip *); /* * XXX - Libalias checksum offload 'duct tape': * * locally generated packets have only pseudo-header checksum * calculated and libalias will break it[1], so mark them for * later fix. Moreover there are cases when libalias modifies * tcp packet data[2], mark them for later fix too. * * [1] libalias was never meant to run in kernel, so it does * not have any knowledge about checksum offloading, and * expects a packet with a full internet checksum. * Unfortunately, packets generated locally will have just the * pseudo header calculated, and when libalias tries to adjust * the checksum it will actually compute a wrong value. * * [2] when libalias modifies tcp's data content, full TCP * checksum has to be recomputed: the problem is that * libalias does not have any idea about checksum offloading. * To work around this, we do not do checksumming in LibAlias, * but only mark the packets in th_x2 field. If we receive a * marked packet, we calculate correct checksum for it * aware of offloading. Why such a terrible hack instead of * recalculating checksum for each packet? * Because the previous checksum was not checked! * Recalculating checksums for EVERY packet will hide ALL * transmission errors. Yes, marked packets still suffer from * this problem. But, sigh, natd(8) has this problem, too. * * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ if (mcl->m_pkthdr.rcvif == NULL && mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); /* Check if this is 'global' instance */ if (t == NULL) { if (args->flags & IPFW_ARGS_IN) { /* Wrong direction, skip processing */ args->m = mcl; return (IP_FW_NAT); } found = 0; chain = &V_layer3_chain; IPFW_RLOCK_ASSERT(chain); /* Check every nat entry... */ LIST_FOREACH(t, &chain->nat, _next) { if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) continue; retval = LibAliasOutTry(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl), 0); if (retval == PKT_ALIAS_OK) { /* Nat instance recognises state */ found = 1; break; } } if (found != 1) { /* No instance found, return ignore */ args->m = mcl; return (IP_FW_NAT); } } else { if (args->flags & IPFW_ARGS_IN) retval = LibAliasIn(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); else retval = LibAliasOut(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); } /* * We drop packet when: * 1. libalias returns PKT_ALIAS_ERROR; * 2. For incoming packets: * a) for unresolved fragments; * b) libalias returns PKT_ALIAS_IGNORED and * PKT_ALIAS_DENY_INCOMING flag is set. */ if (retval == PKT_ALIAS_ERROR || ((args->flags & IPFW_ARGS_IN) && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || (retval == PKT_ALIAS_IGNORED && (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { /* XXX - should i add some logging? */ m_free(mcl); args->m = NULL; return (IP_FW_DENY); } if (retval == PKT_ALIAS_RESPOND) mcl->m_flags |= M_SKIP_FIREWALL; mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* * XXX - libalias checksum offload * 'duct tape' (see above) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th; th = (struct tcphdr *)(ip + 1); if (th->th_x2) ldt = 1; } if (ldt) { struct tcphdr *th; struct udphdr *uh; uint16_t ip_len, cksum; ip_len = ntohs(ip->ip_len); cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ip->ip_p + ip_len - (ip->ip_hl << 2))); switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); /* * Maybe it was set in * libalias... */ th->th_x2 = 0; th->th_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); break; } /* No hw checksum offloading: do it ourselves */ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } } args->m = mcl; return (IP_FW_NAT); } static struct cfg_nat * lookup_nat(struct nat_list *l, int nat_id) { struct cfg_nat *res; LIST_FOREACH(res, l, _next) { if (res->id == nat_id) break; } return res; } static struct cfg_nat * lookup_nat_name(struct nat_list *l, char *name) { struct cfg_nat *res; int id; char *errptr; id = strtol(name, &errptr, 10); if (id == 0 || *errptr != '\0') return (NULL); LIST_FOREACH(res, l, _next) { if (res->id == id) break; } return (res); } /* IP_FW3 configuration routines */ static void nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg) { struct cfg_nat *ptr, *tcfg; int gencnt; /* * Find/create nat rule. */ IPFW_UH_WLOCK(chain); gencnt = chain->gencnt; ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO); ptr->lib = LibAliasInit(NULL); LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarily unhook it. */ IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); } /* * Basic nat (re)configuration. */ ptr->id = strtol(ucfg->name, NULL, 10); /* * XXX - what if this rule doesn't nat any ip and just * redirect? * do we set aliasaddress to 0.0.0.0? */ ptr->ip = ucfg->ip; ptr->redir_cnt = ucfg->redir_cnt; ptr->mode = ucfg->mode; strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name)); LibAliasSetMode(ptr->lib, ptr->mode, ~0); LibAliasSetAddress(ptr->lib, ptr->ip); /* * Redir and LSNAT configuration. */ /* Delete old cfgs. */ del_redir_spool_cfg(ptr, &ptr->redir_chain); /* Add new entries. */ add_redir_spool_cfg((char *)(ucfg + 1), ptr); IPFW_UH_WLOCK(chain); /* Extra check to avoid race with another ipfw_nat_cfg() */ tcfg = NULL; if (gencnt != chain->gencnt) tcfg = lookup_nat_name(&chain->nat, ucfg->name); IPFW_WLOCK(chain); if (tcfg != NULL) LIST_REMOVE(tcfg, _next); LIST_INSERT_HEAD(&chain->nat, ptr, _next); IPFW_WUNLOCK(chain); chain->gencnt++; IPFW_UH_WUNLOCK(chain); if (tcfg != NULL) free_nat_instance(ptr); } /* * Creates/configure nat44 instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. ] * * Returns 0 on success */ static int nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; int id; size_t read; char *errptr; /* Check minimum header size */ if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg))) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated and looks like number */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); id = strtol(ucfg->name, &errptr, 10); if (id == 0 || *errptr != '\0') return (EINVAL); read = sizeof(*oh) + sizeof(*ucfg); /* Check number of redirs */ if (sd->valsize < read + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir)) return (EINVAL); nat44_config(chain, ucfg); return (0); } /* * Destroys given nat instances. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct cfg_nat *ptr; ipfw_obj_ntlv *ntlv; /* Check minimum header size */ if (sd->valsize < sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ntlv = &oh->ntlv; /* Check if name is properly terminated */ if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name)) return (EINVAL); IPFW_UH_WLOCK(chain); ptr = lookup_nat_name(&chain->nat, ntlv->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (ESRCH); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } static void export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg) { snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id); ucfg->ip = ptr->ip; ucfg->redir_cnt = ptr->redir_cnt; ucfg->mode = ptr->mode; strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name)); } /* * Gets config for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. ] * * Returns 0 on success */ static int nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat); LIST_FOREACH(r, &ptr->redir_chain, _next) { sz += sizeof(struct nat44_cfg_redir); LIST_FOREACH(s, &r->spool_chain, _next) sz += sizeof(struct nat44_cfg_spool); } ucfg->size = sz; if (sd->valsize < sz) { /* * Submitted buffer size is not enough. * WE've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. */ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } /* Size OK, let's copy data */ LIST_FOREACH(r, &ptr->redir_chain, _next) { ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd, sizeof(*ser_r)); ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space( sd, sizeof(*ser_s)); ser_s->addr = s->addr; ser_s->port = s->port; } } IPFW_UH_RUNLOCK(chain); return (0); } /* * Lists all nat44 instances currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ] * * Returns 0 on success */ static int nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; int nat_count; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(chain); nat_count = 0; LIST_FOREACH(ptr, &chain->nat, _next) nat_count++; olh->count = nat_count; olh->objsize = sizeof(struct nat44_cfg_nat); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } LIST_FOREACH(ptr, &chain->nat, _next) { ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd, sizeof(*ucfg)); export_nat_cfg(ptr, ucfg); } IPFW_UH_RUNLOCK(chain); return (0); } /* * Gets log for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat ] * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ] * * Returns 0 on success */ static int nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; void *pbuf; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } if (ptr->lib->logDesc == NULL) { IPFW_UH_RUNLOCK(chain); return (ENOENT); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE; if (sd->valsize < sz + sizeof(*oh)) { /* * Submitted buffer size is not enough. * WE've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. */ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE); memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE); IPFW_UH_RUNLOCK(chain); return (0); } static struct ipfw_sopt_handler scodes[] = { { IP_FW_NAT44_XCONFIG, 0, HDIR_SET, nat44_cfg }, { IP_FW_NAT44_DESTROY, 0, HDIR_SET, nat44_destroy }, { IP_FW_NAT44_XGETCONFIG, 0, HDIR_GET, nat44_get_cfg }, { IP_FW_NAT44_LIST_NAT, 0, HDIR_GET, nat44_list_nat }, { IP_FW_NAT44_XGETLOG, 0, HDIR_GET, nat44_get_log }, }; /* * Legacy configuration routines */ struct cfg_spool_legacy { LIST_ENTRY(cfg_spool_legacy) _next; struct in_addr addr; u_short port; }; struct cfg_redir_legacy { LIST_ENTRY(cfg_redir) _next; u_int16_t mode; struct in_addr laddr; struct in_addr paddr; struct in_addr raddr; u_short lport; u_short pport; u_short rport; u_short pport_cnt; u_short rport_cnt; int proto; struct alias_link **alink; u_int16_t spool_cnt; LIST_HEAD(, cfg_spool_legacy) spool_chain; }; struct cfg_nat_legacy { LIST_ENTRY(cfg_nat_legacy) _next; int id; struct in_addr ip; char if_name[IF_NAMESIZE]; int mode; struct libalias *lib; int redir_cnt; LIST_HEAD(, cfg_redir_legacy) redir_chain; }; static int ipfw_nat_cfg(struct sockopt *sopt) { struct cfg_nat_legacy *cfg; struct nat44_cfg_nat *ucfg; struct cfg_redir_legacy *rdir; struct nat44_cfg_redir *urdir; char *buf; size_t len, len2; int error, i; len = sopt->sopt_valsize; len2 = len + 128; /* * Allocate 2x buffer to store converted structures. * new redir_cfg has shrunk, so we're sure that * new buffer size is enough. */ buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy)); if (error != 0) goto out; cfg = (struct cfg_nat_legacy *)buf; if (cfg->id < 0) { error = EINVAL; goto out; } ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)]; snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id); strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name)); ucfg->ip = cfg->ip; ucfg->mode = cfg->mode; ucfg->redir_cnt = cfg->redir_cnt; if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) { error = EINVAL; goto out; } urdir = (struct nat44_cfg_redir *)(ucfg + 1); rdir = (struct cfg_redir_legacy *)(cfg + 1); for (i = 0; i < cfg->redir_cnt; i++) { urdir->mode = rdir->mode; urdir->laddr = rdir->laddr; urdir->paddr = rdir->paddr; urdir->raddr = rdir->raddr; urdir->lport = rdir->lport; urdir->pport = rdir->pport; urdir->rport = rdir->rport; urdir->pport_cnt = rdir->pport_cnt; urdir->rport_cnt = rdir->rport_cnt; urdir->proto = rdir->proto; urdir->spool_cnt = rdir->spool_cnt; urdir++; rdir++; } nat44_config(&V_layer3_chain, ucfg); out: free(buf, M_TEMP); return (error); } static int ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; int i; sooptcopyin(sopt, &i, sizeof i, sizeof i); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (EINVAL); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } static int ipfw_nat_get_cfg(struct sockopt *sopt) { struct ip_fw_chain *chain = &V_layer3_chain; struct cfg_nat *n; struct cfg_nat_legacy *ucfg; struct cfg_redir *r; struct cfg_spool *s; struct cfg_redir_legacy *ser_r; struct cfg_spool_legacy *ser_s; char *data; int gencnt, nat_cnt, len, error; nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); retry: gencnt = chain->gencnt; /* Estimate memory amount */ LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) len += sizeof(struct cfg_spool_legacy); } } IPFW_UH_RUNLOCK(chain); data = malloc(len, M_TEMP, M_WAITOK | M_ZERO); bcopy(&nat_cnt, data, sizeof(nat_cnt)); nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); if (gencnt != chain->gencnt) { free(data, M_TEMP); goto retry; } /* Serialize all the data. */ LIST_FOREACH(n, &chain->nat, _next) { ucfg = (struct cfg_nat_legacy *)&data[len]; ucfg->id = n->id; ucfg->ip = n->ip; ucfg->redir_cnt = n->redir_cnt; ucfg->mode = n->mode; strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name)); len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { ser_r = (struct cfg_redir_legacy *)&data[len]; ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct cfg_spool_legacy *)&data[len]; ser_s->addr = s->addr; ser_s->port = s->port; len += sizeof(struct cfg_spool_legacy); } } } IPFW_UH_RUNLOCK(chain); error = sooptcopyout(sopt, data, len); free(data, M_TEMP); return (error); } static int ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; int i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; IPFW_RLOCK(chain); /* one pass to count, one to copy the data */ i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; i++; } size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { IPFW_RUNLOCK(chain); return (ENOSPC); } i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); sooptcopyout(sopt, data, size); free(data, M_IPFW); return(0); } static int vnet_ipfw_nat_init(const void *arg __unused) { V_ipfw_nat_ready = 1; return (0); } static int vnet_ipfw_nat_uninit(const void *arg __unused) { struct cfg_nat *ptr, *ptr_temp; struct ip_fw_chain *chain; chain = &V_layer3_chain; IPFW_WLOCK(chain); V_ipfw_nat_ready = 0; LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); free_nat_instance(ptr); } flush_nat_ptrs(chain, -1 /* flush all */); IPFW_WUNLOCK(chain); return (0); } static void ipfw_nat_init(void) { /* init ipfw hooks */ ipfw_nat_ptr = ipfw_nat; lookup_nat_ptr = lookup_nat; ipfw_nat_cfg_ptr = ipfw_nat_cfg; ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; IPFW_ADD_SOPT_HANDLER(1, scodes); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); } static void ipfw_nat_destroy(void) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); /* deregister ipfw_nat */ IPFW_DEL_SOPT_HANDLER(1, scodes); ipfw_nat_ptr = NULL; lookup_nat_ptr = NULL; ipfw_nat_cfg_ptr = NULL; ipfw_nat_del_ptr = NULL; ipfw_nat_get_cfg_ptr = NULL; ipfw_nat_get_log_ptr = NULL; } static int ipfw_nat_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: break; default: return EOPNOTSUPP; break; } return err; } static moduledata_t ipfw_nat_mod = { "ipfw_nat", ipfw_nat_modevent, 0 }; /* Define startup order. */ #define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ #define IPFW_NAT_MODULE_ORDER (IPFW_NAT_MODEVENT_ORDER + 1) #define IPFW_NAT_VNET_ORDER (IPFW_NAT_MODEVENT_ORDER + 2) DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3); MODULE_VERSION(ipfw_nat, 1); SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_init, NULL); VNET_SYSINIT(vnet_ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_init, NULL); SYSUNINIT(ipfw_nat_destroy, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_nat_uninit, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_uninit, NULL); /* end of file */ Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_private.h =================================================================== --- projects/fuse2/sys/netpfil/ipfw/ip_fw_private.h (revision 350434) +++ projects/fuse2/sys/netpfil/ipfw/ip_fw_private.h (revision 350435) @@ -1,823 +1,824 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IPFW2_PRIVATE_H #define _IPFW2_PRIVATE_H /* * Internal constants and data structures used by ipfw components * and not meant to be exported outside the kernel. */ #ifdef _KERNEL /* * For platforms that do not have SYSCTL support, we wrap the * SYSCTL_* into a function (one per file) to collect the values * into an array at module initialization. The wrapping macros, * SYSBEGIN() and SYSEND, are empty in the default case. */ #ifndef SYSBEGIN #define SYSBEGIN(x) #endif #ifndef SYSEND #define SYSEND #endif /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, IP_FW_DENY, IP_FW_DIVERT, IP_FW_TEE, IP_FW_DUMMYNET, IP_FW_NETGRAPH, IP_FW_NGTEE, IP_FW_NAT, IP_FW_REASS, IP_FW_NAT64, }; /* * Structure for collecting parameters to dummynet for ip6_output forwarding */ struct _ip6dn_args { struct ip6_pktopts *opt_or; int flags_or; struct ip6_moptions *im6o_or; struct ifnet *origifp_or; struct ifnet *ifp_or; struct sockaddr_in6 dst_or; u_long mtu_or; }; /* * Arguments for calling ipfw_chk() and dummynet_io(). We put them * all into a structure because this way it is easier and more * efficient to pass variables around and extend the interface. */ struct ip_fw_args { uint32_t flags; #define IPFW_ARGS_ETHER 0x00010000 /* valid ethernet header */ #define IPFW_ARGS_NH4 0x00020000 /* IPv4 next hop in hopstore */ #define IPFW_ARGS_NH6 0x00040000 /* IPv6 next hop in hopstore */ #define IPFW_ARGS_NH4PTR 0x00080000 /* IPv4 next hop in next_hop */ #define IPFW_ARGS_NH6PTR 0x00100000 /* IPv6 next hop in next_hop6 */ #define IPFW_ARGS_REF 0x00200000 /* valid ipfw_rule_ref */ #define IPFW_ARGS_IN 0x00400000 /* called on input */ #define IPFW_ARGS_OUT 0x00800000 /* called on output */ #define IPFW_ARGS_IP4 0x01000000 /* belongs to v4 ISR */ #define IPFW_ARGS_IP6 0x02000000 /* belongs to v6 ISR */ #define IPFW_ARGS_DROP 0x04000000 /* drop it (dummynet) */ #define IPFW_ARGS_LENMASK 0x0000ffff /* length of data in *mem */ #define IPFW_ARGS_LENGTH(f) ((f) & IPFW_ARGS_LENMASK) /* * On return, it points to the matching rule. * On entry, rule.slot > 0 means the info is valid and * contains the starting rule for an ipfw search. * If chain_id == chain->id && slot >0 then jump to that slot. * Otherwise, we locate the first rule >= rulenum:rule_id */ struct ipfw_rule_ref rule; /* match/restart info */ struct ifnet *ifp; /* input/output interface */ struct inpcb *inp; union { /* * next_hop[6] pointers can be used to point to next hop * stored in rule's opcode to avoid copying into hopstore. * Also, it is expected that all 0x1-0x10 flags are mutually * exclusive. */ struct sockaddr_in *next_hop; struct sockaddr_in6 *next_hop6; /* ipfw next hop storage */ struct sockaddr_in hopstore; struct ip_fw_nh6 { struct in6_addr sin6_addr; uint32_t sin6_scope_id; uint16_t sin6_port; } hopstore6; }; union { struct mbuf *m; /* the mbuf chain */ void *mem; /* or memory pointer */ }; struct ipfw_flow_id f_id; /* grabbed from IP header */ }; MALLOC_DECLARE(M_IPFW); /* wrapper for freeing a packet, in case we need to do more work */ #ifndef FREE_PKT #if defined(__linux__) || defined(_WIN32) #define FREE_PKT(m) netisr_dispatch(-1, m) #else #define FREE_PKT(m) m_freem(m) #endif #endif /* !FREE_PKT */ /* * Function definitions. */ int ipfw_chk(struct ip_fw_args *args); struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); int ipfw_attach_hooks(void); void ipfw_detach_hooks(void); #ifdef NOTYET void ipfw_nat_destroy(void); #endif /* In ip_fw_log.c */ struct ip; struct ip_fw_chain; void ipfw_bpf_init(int); void ipfw_bpf_uninit(int); void ipfw_bpf_tap(u_char *, u_int); void ipfw_bpf_mtap(struct mbuf *); void ipfw_bpf_mtap2(void *, u_int, struct mbuf *); void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, struct ip_fw_args *args, u_short offset, uint32_t tablearg, struct ip *ip); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); #define V_verbose_limit VNET(verbose_limit) /* In ip_fw_dynamic.c */ struct sockopt_data; enum { /* result for matching dynamic rules */ MATCH_REVERSE = 0, MATCH_FORWARD, MATCH_NONE, MATCH_UNKNOWN, }; /* * Macro to determine that we need to do or redo dynamic state lookup. * direction == MATCH_UNKNOWN means that this is first lookup, then we need * to do lookup. * Otherwise check the state name, if previous lookup was for "any" name, * this means there is no state with specific name. Thus no need to do * lookup. If previous name was not "any", redo lookup for specific name. */ #define DYN_LOOKUP_NEEDED(p, cmd) \ ((p)->direction == MATCH_UNKNOWN || \ ((p)->kidx != 0 && (p)->kidx != (cmd)->arg1)) #define DYN_INFO_INIT(p) do { \ (p)->direction = MATCH_UNKNOWN; \ (p)->kidx = 0; \ } while (0) struct ipfw_dyn_info { uint16_t direction; /* match direction */ uint16_t kidx; /* state name kidx */ uint32_t hashval; /* hash value */ uint32_t version; /* bucket version */ uint32_t f_pos; }; int ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, const ipfw_insn_limit *cmd, const struct ip_fw_args *args, const void *ulp, int pktlen, struct ipfw_dyn_info *info, uint32_t tablearg); struct ip_fw *ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp, int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info); int ipfw_is_dyn_rule(struct ip_fw *rule); void ipfw_expire_dyn_states(struct ip_fw_chain *, ipfw_range_tlv *); void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep); int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd); void ipfw_dyn_init(struct ip_fw_chain *); /* per-vnet initialization */ void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ int ipfw_dyn_len(void); uint32_t ipfw_dyn_get_count(uint32_t *, int *); void ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint16_t eaction_id, uint16_t default_id, uint16_t instance_id); /* common variables */ VNET_DECLARE(int, fw_one_pass); #define V_fw_one_pass VNET(fw_one_pass) VNET_DECLARE(int, fw_verbose); #define V_fw_verbose VNET(fw_verbose) VNET_DECLARE(struct ip_fw_chain, layer3_chain); #define V_layer3_chain VNET(layer3_chain) VNET_DECLARE(int, ipfw_vnet_ready); #define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) VNET_DECLARE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DECLARE(int, autoinc_step); #define V_autoinc_step VNET(autoinc_step) VNET_DECLARE(unsigned int, fw_tables_max); #define V_fw_tables_max VNET(fw_tables_max) VNET_DECLARE(unsigned int, fw_tables_sets); #define V_fw_tables_sets VNET(fw_tables_sets) struct tables_config; #ifdef _KERNEL /* * Here we have the structure representing an ipfw rule. * * It starts with a general area * followed by an array of one or more instructions, which the code * accesses as an array of 32-bit values. * * Given a rule pointer r: * * r->cmd is the start of the first instruction. * ACTION_PTR(r) is the start of the first action (things to do * once a rule matched). */ struct ip_fw { uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ uint8_t flags; /* currently unused */ counter_u64_t cntr; /* Pointer to rule counters */ uint32_t timestamp; /* tv_sec of last match */ uint32_t id; /* rule id */ uint32_t cached_id; /* used by jump_fast */ uint32_t cached_pos; /* used by jump_fast */ uint32_t refcnt; /* number of references */ struct ip_fw *next; /* linked list of deleted rules */ ipfw_insn cmd[1]; /* storage for commands */ }; #define IPFW_RULE_CNTR_SIZE (2 * sizeof(uint64_t)) #endif struct ip_fw_chain { struct ip_fw **map; /* array of rule ptrs to ease lookup */ uint32_t id; /* ruleset id */ int n_rules; /* number of static rules */ void *tablestate; /* runtime table info */ void *valuestate; /* runtime table value info */ int *idxmap; /* skipto array of rules */ void **srvstate; /* runtime service mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; #else struct rmlock rwmtx; #endif int static_len; /* total len of static rules (v0) */ uint32_t gencnt; /* NAT generation count */ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ struct ip_fw *default_rule; struct tables_config *tblcfg; /* tables module data */ void *ifcfg; /* interface module data */ int *idxmap_back; /* standby skipto array of rules */ struct namedobj_instance *srvmap; /* cfg name->number mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t uh_lock; #else struct rwlock uh_lock; /* lock for upper half */ #endif }; /* 64-byte structure representing multi-field table value */ struct table_value { uint32_t tag; /* O_TAG/O_TAGGED */ uint32_t pipe; /* O_PIPE/O_QUEUE */ uint16_t divert; /* O_DIVERT/O_TEE */ uint16_t skipto; /* skipto, CALLRET */ uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ uint32_t fib; /* O_SETFIB */ uint32_t nat; /* O_NAT */ uint32_t nh4; uint8_t dscp; uint8_t spare0; uint16_t spare1; /* -- 32 bytes -- */ struct in6_addr nh6; uint32_t limit; /* O_LIMIT */ uint32_t zoneid; /* scope zone id for nh6 */ uint64_t refcnt; /* Number of references */ }; struct named_object { TAILQ_ENTRY(named_object) nn_next; /* namehash */ TAILQ_ENTRY(named_object) nv_next; /* valuehash */ char *name; /* object name */ uint16_t etlv; /* Export TLV id */ uint8_t subtype;/* object subtype within class */ uint8_t set; /* set object belongs to */ uint16_t kidx; /* object kernel index */ uint16_t spare; uint32_t ocnt; /* object counter for internal use */ uint32_t refcnt; /* number of references */ }; TAILQ_HEAD(namedobjects_head, named_object); struct sockopt; /* used by tcp_var.h */ struct sockopt_data { caddr_t kbuf; /* allocated buffer */ size_t ksize; /* given buffer size */ size_t koff; /* data already used */ size_t kavail; /* number of bytes available */ size_t ktotal; /* total bytes pushed */ struct sockopt *sopt; /* socket data */ caddr_t sopt_val; /* sopt user buffer */ size_t valsize; /* original data size */ }; struct ipfw_ifc; typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); struct ipfw_iface { struct named_object no; char ifname[64]; int resolved; uint16_t ifindex; uint16_t spare; uint64_t gencnt; TAILQ_HEAD(, ipfw_ifc) consumers; }; struct ipfw_ifc { TAILQ_ENTRY(ipfw_ifc) next; struct ipfw_iface *iface; ipfw_ifc_cb *cb; void *cbdata; }; /* Macro for working with various counters */ #define IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \ counter_u64_add((_cntr)->cntr, 1); \ counter_u64_add((_cntr)->cntr + 1, _bytes); \ if ((_cntr)->timestamp != time_uptime) \ (_cntr)->timestamp = time_uptime; \ } while (0) #define IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \ (_cntr)->pcnt++; \ (_cntr)->bcnt += _bytes; \ } while (0) #define IPFW_ZERO_RULE_COUNTER(_cntr) do { \ counter_u64_zero((_cntr)->cntr); \ counter_u64_zero((_cntr)->cntr + 1); \ (_cntr)->timestamp = 0; \ } while (0) #define IPFW_ZERO_DYN_COUNTER(_cntr) do { \ (_cntr)->pcnt = 0; \ (_cntr)->bcnt = 0; \ } while (0) #define TARG_VAL(ch, k, f) ((struct table_value *)((ch)->valuestate))[k].f #define IP_FW_ARG_TABLEARG(ch, a, f) \ (((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a)) /* * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c * so the variable and the macros must be here. */ #if defined( __linux__ ) || defined( _WIN32 ) #define IPFW_LOCK_INIT(_chain) do { \ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rw_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_RLOCKED) #define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK_TRACKER #define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) #define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) #define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) #define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #else /* FreeBSD */ #define IPFW_LOCK_INIT(_chain) do { \ rm_init_flags(&(_chain)->rwmtx, "IPFW static rules", RM_RECURSE); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rm_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED) #define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker #define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) #define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) #define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) #define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #endif #define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) #define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED) #define IPFW_UH_UNLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_UNLOCKED) #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) #define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) struct obj_idx { uint16_t uidx; /* internal index supplied by userland */ uint16_t kidx; /* kernel object index */ uint16_t off; /* tlv offset from rule end in 4-byte words */ uint8_t spare; uint8_t type; /* object type within its category */ }; struct rule_check_info { uint16_t flags; /* rule-specific check flags */ uint16_t object_opcodes; /* num of opcodes referencing objects */ uint16_t urule_numoff; /* offset of rulenum in bytes */ uint8_t version; /* rule version */ uint8_t spare; ipfw_obj_ctlv *ctlv; /* name TLV containter */ struct ip_fw *krule; /* resulting rule pointer */ caddr_t urule; /* original rule pointer */ struct obj_idx obuf[8]; /* table references storage */ }; /* Legacy interface support */ /* * FreeBSD 8 export rule format */ struct ip_fw_rule0 { struct ip_fw *x_next; /* linked list of rules */ struct ip_fw *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ uint8_t _pad; /* padding */ uint32_t id; /* rule id */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; struct ip_fw_bcounter0 { uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ }; /* Kernel rule length */ /* * RULE _K_ SIZE _V_ -> * get kernel size from userland rool version _V_. * RULE _U_ SIZE _V_ -> * get user size version _V_ from kernel rule * RULESIZE _V_ -> * get user size rule length */ /* FreeBSD8 <> current kernel format */ #define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4) #define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* FreeBSD11 <> current kernel format */ #define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \ (r)->cmd_len * 4 - 4, 8)) #define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* * Tables/Objects index rewriting code */ /* Default and maximum number of ipfw tables/objects. */ #define IPFW_TABLES_MAX 65536 #define IPFW_TABLES_DEFAULT 128 #define IPFW_OBJECTS_MAX 65536 #define IPFW_OBJECTS_DEFAULT 1024 #define CHAIN_TO_SRV(ch) ((ch)->srvmap) #define SRV_OBJECT(ch, idx) ((ch)->srvstate[(idx)]) struct tid_info { uint32_t set; /* table set */ uint16_t uidx; /* table index */ uint8_t type; /* table type */ uint8_t atype; uint8_t spare; int tlen; /* Total TLV size block */ void *tlvs; /* Pointer to first TLV */ }; /* * Classifier callback. Checks if @cmd opcode contains kernel object reference. * If true, returns its index and type. * Returns 0 if match is found, 1 overwise. */ typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); /* * Updater callback. Sets kernel object reference index to @puidx */ typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint16_t puidx); /* * Finder callback. Tries to find named object by name (specified via @ti). * Stores found named object pointer in @pno. * If object was not found, NULL is stored. * * Return 0 if input data was valid. */ typedef int (ipfw_obj_fname_cb)(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno); /* * Another finder callback. Tries to findex named object by kernel index. * * Returns pointer to named object or NULL. */ typedef struct named_object *(ipfw_obj_fidx_cb)(struct ip_fw_chain *ch, uint16_t kidx); /* * Object creator callback. Tries to create object specified by @ti. * Stores newly-allocated object index in @pkidx. * * Returns 0 on success. */ typedef int (ipfw_obj_create_cb)(struct ip_fw_chain *ch, struct tid_info *ti, uint16_t *pkidx); /* * Object destroy callback. Intended to free resources allocated by * create_object callback. */ typedef void (ipfw_obj_destroy_cb)(struct ip_fw_chain *ch, struct named_object *no); /* * Sets handler callback. Handles moving and swaping set of named object. * SWAP_ALL moves all named objects from set `set' to `new_set' and vise versa; * TEST_ALL checks that there aren't any named object with conflicting names; * MOVE_ALL moves all named objects from set `set' to `new_set'; * COUNT_ONE used to count number of references used by object with kidx `set'; * TEST_ONE checks that named object with kidx `set' can be moved to `new_set`; * MOVE_ONE moves named object with kidx `set' to set `new_set'. */ enum ipfw_sets_cmd { SWAP_ALL = 0, TEST_ALL, MOVE_ALL, COUNT_ONE, TEST_ONE, MOVE_ONE }; typedef int (ipfw_obj_sets_cb)(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); struct opcode_obj_rewrite { uint32_t opcode; /* Opcode to act upon */ uint32_t etlv; /* Relevant export TLV id */ ipfw_obj_rw_cl *classifier; /* Check if rewrite is needed */ ipfw_obj_rw_upd *update; /* update cmd with new value */ ipfw_obj_fname_cb *find_byname; /* Find named object by name */ ipfw_obj_fidx_cb *find_bykidx; /* Find named object by kidx */ ipfw_obj_create_cb *create_object; /* Create named object */ ipfw_obj_destroy_cb *destroy_object;/* Destroy named object */ ipfw_obj_sets_cb *manage_sets; /* Swap or move sets */ }; #define IPFW_ADD_OBJ_REWRITER(f, c) do { \ if ((f) != 0) \ ipfw_add_obj_rewriter(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) #define IPFW_DEL_OBJ_REWRITER(l, c) do { \ if ((l) != 0) \ ipfw_del_obj_rewriter(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) /* In ip_fw_iface.c */ int ipfw_iface_init(void); void ipfw_iface_destroy(void); void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch); int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, struct ipfw_ifc *ic); void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); /* In ip_fw_sockopt.c */ void ipfw_init_skipto_cache(struct ip_fw_chain *chain); void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain); int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); int ipfw_ctl3(struct sockopt *sopt); int ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule, int locked); void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, struct ip_fw *rule); void ipfw_reap_rules(struct ip_fw *head); void ipfw_init_counters(void); void ipfw_destroy_counters(void); struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize); void ipfw_free_rule(struct ip_fw *rule); int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); int ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx); +ipfw_insn *ipfw_get_action(struct ip_fw *); typedef int (sopt_handler_f)(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); struct ipfw_sopt_handler { uint16_t opcode; uint8_t version; uint8_t dir; sopt_handler_f *handler; uint64_t refcnt; }; #define HDIR_SET 0x01 /* Handler is used to set some data */ #define HDIR_GET 0x02 /* Handler is used to retrieve data */ #define HDIR_BOTH HDIR_GET|HDIR_SET void ipfw_init_sopt_handler(void); void ipfw_destroy_sopt_handler(void); void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed); caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); #define IPFW_ADD_SOPT_HANDLER(f, c) do { \ if ((f) != 0) \ ipfw_add_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) #define IPFW_DEL_SOPT_HANDLER(l, c) do { \ if ((l) != 0) \ ipfw_del_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) struct namedobj_instance; typedef int (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, void *arg); typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, const void *key, uint32_t kopt); typedef int (objhash_cmp_f)(struct named_object *no, const void *key, uint32_t kopt); struct namedobj_instance *ipfw_objhash_create(uint32_t items); void ipfw_objhash_destroy(struct namedobj_instance *); void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks); void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_free(void *idx, int blocks); void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name); struct named_object *ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, uint32_t type, const char *name); struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, struct named_object *b); void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no); void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no); uint32_t ipfw_objhash_count(struct namedobj_instance *ni); uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type); int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg); int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f, void *arg, uint16_t type); int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, uint32_t etlv, struct named_object **pno); void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv); ipfw_obj_ntlv *ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv); void ipfw_init_obj_rewriter(void); void ipfw_destroy_obj_rewriter(void); void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti); void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx); int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx); void ipfw_init_srv(struct ip_fw_chain *ch); void ipfw_destroy_srv(struct ip_fw_chain *ch); int ipfw_check_object_name_generic(const char *name); int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); /* In ip_fw_eaction.c */ typedef int (ipfw_eaction_t)(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); int ipfw_eaction_init(struct ip_fw_chain *ch, int first); void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last); uint16_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, const char *name); int ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id); int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); int ipfw_reset_eaction(struct ip_fw_chain *ch, struct ip_fw *rule, uint16_t eaction_id, uint16_t default_id, uint16_t instance_id); int ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint16_t eaction_id, uint16_t instance_id); /* In ip_fw_table.c */ struct table_info; typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, void *paddr, uint32_t *val); struct named_object *ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint16_t kidx); int ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx); void ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx); int ipfw_init_tables(struct ip_fw_chain *ch, int first); int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets); void ipfw_destroy_tables(struct ip_fw_chain *ch, int last); /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); typedef int ipfw_nat_cfg_t(struct sockopt *); VNET_DECLARE(int, ipfw_nat_ready); #define V_ipfw_nat_ready VNET(ipfw_nat_ready) #define IPFW_NAT_LOADED (V_ipfw_nat_ready) extern ipfw_nat_t *ipfw_nat_ptr; extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; /* Helper functions for IP checksum adjustment */ static __inline uint16_t cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } static __inline uint16_t cksum_adjust(uint16_t oldsum, uint16_t old, uint16_t new) { return (~cksum_add(cksum_add(~oldsum, ~old), new)); } #endif /* _KERNEL */ #endif /* _IPFW2_PRIVATE_H */ Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_sockopt.c =================================================================== --- projects/fuse2/sys/netpfil/ipfw/ip_fw_sockopt.c (revision 350434) +++ projects/fuse2/sys/netpfil/ipfw/ip_fw_sockopt.c (revision 350435) @@ -1,4686 +1,4715 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * Copyright (c) 2014 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Supported by: Valeria Paoli * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Control socket and rule management routines for ipfw. * Control is currently implemented via IP_FW3 setsockopt() code. */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include /* struct m_tag used by nested headers */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* hooks */ #include #include #include #ifdef MAC #include #endif static int ipfw_ctl(struct sockopt *sopt); static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci); static int check_ipfw_rule1(struct ip_fw_rule *rule, int size, struct rule_check_info *ci); static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, struct rule_check_info *ci); static int rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci); #define NAMEDOBJ_HASH_SIZE 32 struct namedobj_instance { struct namedobjects_head *names; struct namedobjects_head *values; uint32_t nn_size; /* names hash size */ uint32_t nv_size; /* number hash size */ u_long *idx_mask; /* used items bitmask */ uint32_t max_blocks; /* number of "long" blocks in bitmask */ uint32_t count; /* number of items */ uint16_t free_off[IPFW_MAX_SETS]; /* first possible free offset */ objhash_hash_f *hash_f; objhash_cmp_f *cmp_f; }; #define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ static uint32_t objhash_hash_name(struct namedobj_instance *ni, const void *key, uint32_t kopt); static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val); static int objhash_cmp_name(struct named_object *no, const void *name, uint32_t set); MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd); /* ctl3 handler data */ struct mtx ctl3_lock; #define CTL3_LOCK_INIT() mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF) #define CTL3_LOCK_DESTROY() mtx_destroy(&ctl3_lock) #define CTL3_LOCK() mtx_lock(&ctl3_lock) #define CTL3_UNLOCK() mtx_unlock(&ctl3_lock) static struct ipfw_sopt_handler *ctl3_handlers; static size_t ctl3_hsize; static uint64_t ctl3_refct, ctl3_gencnt; #define CTL3_SMALLBUF 4096 /* small page-size write buffer */ #define CTL3_LARGEBUF 16 * 1024 * 1024 /* handle large rulesets */ static int ipfw_flush_sopt_data(struct sockopt_data *sd); static struct ipfw_sopt_handler scodes[] = { { IP_FW_XGET, 0, HDIR_GET, dump_config }, { IP_FW_XADD, 0, HDIR_BOTH, add_rules }, { IP_FW_XDEL, 0, HDIR_BOTH, del_rules }, { IP_FW_XZERO, 0, HDIR_SET, clear_rules }, { IP_FW_XRESETLOG, 0, HDIR_SET, clear_rules }, { IP_FW_XMOVE, 0, HDIR_SET, move_rules }, { IP_FW_SET_SWAP, 0, HDIR_SET, manage_sets }, { IP_FW_SET_MOVE, 0, HDIR_SET, manage_sets }, { IP_FW_SET_ENABLE, 0, HDIR_SET, manage_sets }, { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes }, { IP_FW_DUMP_SRVOBJECTS,0, HDIR_GET, dump_srvobjects }, }; static int set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule); static struct opcode_obj_rewrite *find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti); static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, struct obj_idx *pidx, int *unresolved); static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule); static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *end); static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, struct sockopt_data *sd); /* * Opcode object rewriter variables */ struct opcode_obj_rewrite *ctl3_rewriters; static size_t ctl3_rsize; /* * static variables followed by global ones */ VNET_DEFINE_STATIC(uma_zone_t, ipfw_cntr_zone); #define V_ipfw_cntr_zone VNET(ipfw_cntr_zone) void ipfw_init_counters() { V_ipfw_cntr_zone = uma_zcreate("IPFW counters", IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } void ipfw_destroy_counters() { uma_zdestroy(V_ipfw_cntr_zone); } struct ip_fw * ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize) { struct ip_fw *rule; rule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO); rule->cntr = uma_zalloc_pcpu(V_ipfw_cntr_zone, M_WAITOK | M_ZERO); rule->refcnt = 1; return (rule); } void ipfw_free_rule(struct ip_fw *rule) { /* * We don't release refcnt here, since this function * can be called without any locks held. The caller * must release reference under IPFW_UH_WLOCK, and then * call this function if refcount becomes 1. */ if (rule->refcnt > 1) return; uma_zfree_pcpu(V_ipfw_cntr_zone, rule->cntr); free(rule, M_IPFW); } /* * Find the smallest rule >= key, id. * We could use bsearch but it is so simple that we code it directly */ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) { int i, lo, hi; struct ip_fw *r; for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { i = (lo + hi) / 2; r = chain->map[i]; if (r->rulenum < key) lo = i + 1; /* continue from the next one */ else if (r->rulenum > key) hi = i; /* this might be good */ else if (r->id < id) lo = i + 1; /* continue from the next one */ else /* r->id >= id */ hi = i; /* this might be good */ } return hi; } /* * Builds skipto cache on rule set @map. */ static void update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map) { int *smap, rulenum; int i, mi; IPFW_UH_WLOCK_ASSERT(chain); mi = 0; rulenum = map[mi]->rulenum; smap = chain->idxmap_back; if (smap == NULL) return; for (i = 0; i < 65536; i++) { smap[i] = mi; /* Use the same rule index until i < rulenum */ if (i != rulenum || i == 65535) continue; /* Find next rule with num > i */ rulenum = map[++mi]->rulenum; while (rulenum == i) rulenum = map[++mi]->rulenum; } } /* * Swaps prepared (backup) index with current one. */ static void swap_skipto_cache(struct ip_fw_chain *chain) { int *map; IPFW_UH_WLOCK_ASSERT(chain); IPFW_WLOCK_ASSERT(chain); map = chain->idxmap; chain->idxmap = chain->idxmap_back; chain->idxmap_back = map; } /* * Allocate and initialize skipto cache. */ void ipfw_init_skipto_cache(struct ip_fw_chain *chain) { int *idxmap, *idxmap_back; idxmap = malloc(65536 * sizeof(int), M_IPFW, M_WAITOK | M_ZERO); idxmap_back = malloc(65536 * sizeof(int), M_IPFW, M_WAITOK); /* * Note we may be called at any time after initialization, * for example, on first skipto rule, so we need to * provide valid chain->idxmap on return */ IPFW_UH_WLOCK(chain); if (chain->idxmap != NULL) { IPFW_UH_WUNLOCK(chain); free(idxmap, M_IPFW); free(idxmap_back, M_IPFW); return; } /* Set backup pointer first to permit building cache */ chain->idxmap_back = idxmap_back; update_skipto_cache(chain, chain->map); IPFW_WLOCK(chain); /* It is now safe to set chain->idxmap ptr */ chain->idxmap = idxmap; swap_skipto_cache(chain); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); } /* * Destroys skipto cache. */ void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain) { if (chain->idxmap != NULL) free(chain->idxmap, M_IPFW); if (chain->idxmap != NULL) free(chain->idxmap_back, M_IPFW); } /* * allocate a new map, returns the chain locked. extra is the number * of entries to add or delete. */ static struct ip_fw ** get_map(struct ip_fw_chain *chain, int extra, int locked) { for (;;) { struct ip_fw **map; u_int i, mflags; mflags = M_ZERO | ((locked != 0) ? M_NOWAIT : M_WAITOK); i = chain->n_rules + extra; map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags); if (map == NULL) { printf("%s: cannot allocate map\n", __FUNCTION__); return NULL; } if (!locked) IPFW_UH_WLOCK(chain); if (i >= chain->n_rules + extra) /* good */ return map; /* otherwise we lost the race, free and retry */ if (!locked) IPFW_UH_WUNLOCK(chain); free(map, M_IPFW); } } /* * swap the maps. It is supposed to be called with IPFW_UH_WLOCK */ static struct ip_fw ** swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) { struct ip_fw **old_map; IPFW_WLOCK(chain); chain->id++; chain->n_rules = new_len; old_map = chain->map; chain->map = new_map; swap_skipto_cache(chain); IPFW_WUNLOCK(chain); return old_map; } static void export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr) { struct timeval boottime; cntr->size = sizeof(*cntr); if (krule->cntr != NULL) { cntr->pcnt = counter_u64_fetch(krule->cntr); cntr->bcnt = counter_u64_fetch(krule->cntr + 1); cntr->timestamp = krule->timestamp; } if (cntr->timestamp > 0) { getboottime(&boottime); cntr->timestamp += boottime.tv_sec; } } static void export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr) { struct timeval boottime; if (krule->cntr != NULL) { cntr->pcnt = counter_u64_fetch(krule->cntr); cntr->bcnt = counter_u64_fetch(krule->cntr + 1); cntr->timestamp = krule->timestamp; } if (cntr->timestamp > 0) { getboottime(&boottime); cntr->timestamp += boottime.tv_sec; } } /* * Copies rule @urule from v1 userland format (current). * to kernel @krule. * Assume @krule is zeroed. */ static void import_rule1(struct rule_check_info *ci) { struct ip_fw_rule *urule; struct ip_fw *krule; urule = (struct ip_fw_rule *)ci->urule; krule = (struct ip_fw *)ci->krule; /* copy header */ krule->act_ofs = urule->act_ofs; krule->cmd_len = urule->cmd_len; krule->rulenum = urule->rulenum; krule->set = urule->set; krule->flags = urule->flags; /* Save rulenum offset */ ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); /* Copy opcodes */ memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); } /* * Export rule into v1 format (Current). * Layout: * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT) * [ ip_fw_rule ] OR * [ ip_fw_bcounter ip_fw_rule] (depends on rcntrs). * ] * Assume @data is zeroed. */ static void export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs) { struct ip_fw_bcounter *cntr; struct ip_fw_rule *urule; ipfw_obj_tlv *tlv; /* Fill in TLV header */ tlv = (ipfw_obj_tlv *)data; tlv->type = IPFW_TLV_RULE_ENT; tlv->length = len; if (rcntrs != 0) { /* Copy counters */ cntr = (struct ip_fw_bcounter *)(tlv + 1); urule = (struct ip_fw_rule *)(cntr + 1); export_cntr1_base(krule, cntr); } else urule = (struct ip_fw_rule *)(tlv + 1); /* copy header */ urule->act_ofs = krule->act_ofs; urule->cmd_len = krule->cmd_len; urule->rulenum = krule->rulenum; urule->set = krule->set; urule->flags = krule->flags; urule->id = krule->id; /* Copy opcodes */ memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); } /* * Copies rule @urule from FreeBSD8 userland format (v0) * to kernel @krule. * Assume @krule is zeroed. */ static void import_rule0(struct rule_check_info *ci) { struct ip_fw_rule0 *urule; struct ip_fw *krule; int cmdlen, l; ipfw_insn *cmd; ipfw_insn_limit *lcmd; ipfw_insn_if *cmdif; urule = (struct ip_fw_rule0 *)ci->urule; krule = (struct ip_fw *)ci->krule; /* copy header */ krule->act_ofs = urule->act_ofs; krule->cmd_len = urule->cmd_len; krule->rulenum = urule->rulenum; krule->set = urule->set; if ((urule->_pad & 1) != 0) krule->flags |= IPFW_RULE_NOOPT; /* Save rulenum offset */ ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum); /* Copy opcodes */ memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); /* * Alter opcodes: * 1) convert tablearg value from 65535 to 0 * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room * for targ). * 3) convert table number in iface opcodes to u16 * 4) convert old `nat global` into new 65535 */ l = krule->cmd_len; cmd = krule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); switch (cmd->opcode) { /* Opcodes supporting tablearg */ case O_TAG: case O_TAGGED: case O_PIPE: case O_QUEUE: case O_DIVERT: case O_TEE: case O_SKIPTO: case O_CALLRETURN: case O_NETGRAPH: case O_NGTEE: case O_NAT: if (cmd->arg1 == IP_FW_TABLEARG) cmd->arg1 = IP_FW_TARG; else if (cmd->arg1 == 0) cmd->arg1 = IP_FW_NAT44_GLOBAL; break; case O_SETFIB: case O_SETDSCP: if (cmd->arg1 == IP_FW_TABLEARG) cmd->arg1 = IP_FW_TARG; else cmd->arg1 |= 0x8000; break; case O_LIMIT: lcmd = (ipfw_insn_limit *)cmd; if (lcmd->conn_limit == IP_FW_TABLEARG) lcmd->conn_limit = IP_FW_TARG; break; /* Interface tables */ case O_XMIT: case O_RECV: case O_VIA: /* Interface table, possibly */ cmdif = (ipfw_insn_if *)cmd; if (cmdif->name[0] != '\1') break; cmdif->p.kidx = (uint16_t)cmdif->p.glob; break; } } } /* * Copies rule @krule from kernel to FreeBSD8 userland format (v0) */ static void export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len) { int cmdlen, l; ipfw_insn *cmd; ipfw_insn_limit *lcmd; ipfw_insn_if *cmdif; /* copy header */ memset(urule, 0, len); urule->act_ofs = krule->act_ofs; urule->cmd_len = krule->cmd_len; urule->rulenum = krule->rulenum; urule->set = krule->set; if ((krule->flags & IPFW_RULE_NOOPT) != 0) urule->_pad |= 1; /* Copy opcodes */ memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); /* Export counters */ export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt); /* * Alter opcodes: * 1) convert tablearg value from 0 to 65535 * 2) Remove highest bit from O_SETFIB/O_SETDSCP values. * 3) convert table number in iface opcodes to int */ l = urule->cmd_len; cmd = urule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); switch (cmd->opcode) { /* Opcodes supporting tablearg */ case O_TAG: case O_TAGGED: case O_PIPE: case O_QUEUE: case O_DIVERT: case O_TEE: case O_SKIPTO: case O_CALLRETURN: case O_NETGRAPH: case O_NGTEE: case O_NAT: if (cmd->arg1 == IP_FW_TARG) cmd->arg1 = IP_FW_TABLEARG; else if (cmd->arg1 == IP_FW_NAT44_GLOBAL) cmd->arg1 = 0; break; case O_SETFIB: case O_SETDSCP: if (cmd->arg1 == IP_FW_TARG) cmd->arg1 = IP_FW_TABLEARG; else cmd->arg1 &= ~0x8000; break; case O_LIMIT: lcmd = (ipfw_insn_limit *)cmd; if (lcmd->conn_limit == IP_FW_TARG) lcmd->conn_limit = IP_FW_TABLEARG; break; /* Interface tables */ case O_XMIT: case O_RECV: case O_VIA: /* Interface table, possibly */ cmdif = (ipfw_insn_if *)cmd; if (cmdif->name[0] != '\1') break; cmdif->p.glob = cmdif->p.kidx; break; } } } /* * Add new rule(s) to the list possibly creating rule number for each. * Update the rule_number in the input struct so the caller knows it as well. * Must be called without IPFW_UH held */ static int commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) { int error, i, insert_before, tcount; uint16_t rulenum, *pnum; struct rule_check_info *ci; struct ip_fw *krule; struct ip_fw **map; /* the new array of pointers */ /* Check if we need to do table/obj index remap */ tcount = 0; for (ci = rci, i = 0; i < count; ci++, i++) { if (ci->object_opcodes == 0) continue; /* * Rule has some object opcodes. * We need to find (and create non-existing) * kernel objects, and reference existing ones. */ error = rewrite_rule_uidx(chain, ci); if (error != 0) { /* * rewrite failed, state for current rule * has been reverted. Check if we need to * revert more. */ if (tcount > 0) { /* * We have some more table rules * we need to rollback. */ IPFW_UH_WLOCK(chain); while (ci != rci) { ci--; if (ci->object_opcodes == 0) continue; unref_rule_objects(chain,ci->krule); } IPFW_UH_WUNLOCK(chain); } return (error); } tcount++; } /* get_map returns with IPFW_UH_WLOCK if successful */ map = get_map(chain, count, 0 /* not locked */); if (map == NULL) { if (tcount > 0) { /* Unbind tables */ IPFW_UH_WLOCK(chain); for (ci = rci, i = 0; i < count; ci++, i++) { if (ci->object_opcodes == 0) continue; unref_rule_objects(chain, ci->krule); } IPFW_UH_WUNLOCK(chain); } return (ENOSPC); } if (V_autoinc_step < 1) V_autoinc_step = 1; else if (V_autoinc_step > 1000) V_autoinc_step = 1000; /* FIXME: Handle count > 1 */ ci = rci; krule = ci->krule; rulenum = krule->rulenum; /* find the insertion point, we will insert before */ insert_before = rulenum ? rulenum + 1 : IPFW_DEFAULT_RULE; i = ipfw_find_rule(chain, insert_before, 0); /* duplicate first part */ if (i > 0) bcopy(chain->map, map, i * sizeof(struct ip_fw *)); map[i] = krule; /* duplicate remaining part, we always have the default rule */ bcopy(chain->map + i, map + i + 1, sizeof(struct ip_fw *) *(chain->n_rules - i)); if (rulenum == 0) { /* Compute rule number and write it back */ rulenum = i > 0 ? map[i-1]->rulenum : 0; if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) rulenum += V_autoinc_step; krule->rulenum = rulenum; /* Save number to userland rule */ pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff); *pnum = rulenum; } krule->id = chain->id + 1; update_skipto_cache(chain, map); map = swap_map(chain, map, chain->n_rules + 1); chain->static_len += RULEUSIZE0(krule); IPFW_UH_WUNLOCK(chain); if (map) free(map, M_IPFW); return (0); } int ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule, int locked) { struct ip_fw **map; map = get_map(chain, 1, locked); if (map == NULL) return (ENOMEM); if (chain->n_rules > 0) bcopy(chain->map, map, chain->n_rules * sizeof(struct ip_fw *)); map[chain->n_rules] = rule; rule->rulenum = IPFW_DEFAULT_RULE; rule->set = RESVD_SET; rule->id = chain->id + 1; /* We add rule in the end of chain, no need to update skipto cache */ map = swap_map(chain, map, chain->n_rules + 1); chain->static_len += RULEUSIZE0(rule); IPFW_UH_WUNLOCK(chain); free(map, M_IPFW); return (0); } /* * Adds @rule to the list of rules to reap */ void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, struct ip_fw *rule) { IPFW_UH_WLOCK_ASSERT(chain); /* Unlink rule from everywhere */ unref_rule_objects(chain, rule); rule->next = *head; *head = rule; } /* * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. * A NULL pointer on input is handled correctly. */ void ipfw_reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->next; ipfw_free_rule(rule); } } /* * Rules to keep are * (default || reserved || !match_set || !match_number) * where * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) * // the default rule is always protected * * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") * * match_set ::= (cmd == 0 || rule->set == set) * // set number is ignored for cmd == 0 * * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) * // number is ignored for cmd == 1 or n == 0 * */ int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt) { /* Don't match default rule for modification queries */ if (rule->rulenum == IPFW_DEFAULT_RULE && (rt->flags & IPFW_RCFLAG_DEFAULT) == 0) return (0); /* Don't match rules in reserved set for flush requests */ if ((rt->flags & IPFW_RCFLAG_ALL) != 0 && rule->set == RESVD_SET) return (0); /* If we're filtering by set, don't match other sets */ if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set) return (0); if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 && (rule->rulenum < rt->start_rule || rule->rulenum > rt->end_rule)) return (0); return (1); } struct manage_sets_args { uint16_t set; uint8_t new_set; }; static int swap_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set == (uint8_t)args->set) no->set = args->new_set; else if (no->set == args->new_set) no->set = (uint8_t)args->set; return (0); } static int move_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set == (uint8_t)args->set) no->set = args->new_set; return (0); } static int test_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set != (uint8_t)args->set) return (0); if (ipfw_objhash_lookup_name_type(ni, args->new_set, no->etlv, no->name) != NULL) return (EEXIST); return (0); } /* * Generic function to handler moving and swapping sets. */ int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { struct manage_sets_args args; struct named_object *no; args.set = set; args.new_set = new_set; switch (cmd) { case SWAP_ALL: return (ipfw_objhash_foreach_type(ni, swap_sets_cb, &args, type)); case TEST_ALL: return (ipfw_objhash_foreach_type(ni, test_sets_cb, &args, type)); case MOVE_ALL: return (ipfw_objhash_foreach_type(ni, move_sets_cb, &args, type)); case COUNT_ONE: /* * @set used to pass kidx. * When @new_set is zero - reset object counter, * otherwise increment it. */ no = ipfw_objhash_lookup_kidx(ni, set); if (new_set != 0) no->ocnt++; else no->ocnt = 0; return (0); case TEST_ONE: /* @set used to pass kidx */ no = ipfw_objhash_lookup_kidx(ni, set); /* * First check number of references: * when it differs, this mean other rules are holding * reference to given object, so it is not possible to * change its set. Note that refcnt may account references * to some going-to-be-added rules. Since we don't know * their numbers (and even if they will be added) it is * perfectly OK to return error here. */ if (no->ocnt != no->refcnt) return (EBUSY); if (ipfw_objhash_lookup_name_type(ni, new_set, type, no->name) != NULL) return (EEXIST); return (0); case MOVE_ONE: /* @set used to pass kidx */ no = ipfw_objhash_lookup_kidx(ni, set); no->set = new_set; return (0); } return (EINVAL); } /* * Delete rules matching range @rt. * Saves number of deleted rules in @ndel. * * Returns 0 on success. */ static int delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel) { struct ip_fw *reap, *rule, **map; int end, start; int i, n, ndyn, ofs; reap = NULL; IPFW_UH_WLOCK(chain); /* arbitrate writers */ /* * Stage 1: Determine range to inspect. * Range is half-inclusive, e.g [start, end). */ start = 0; end = chain->n_rules - 1; if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) { start = ipfw_find_rule(chain, rt->start_rule, 0); if (rt->end_rule >= IPFW_DEFAULT_RULE) rt->end_rule = IPFW_DEFAULT_RULE - 1; end = ipfw_find_rule(chain, rt->end_rule, UINT32_MAX); } if (rt->flags & IPFW_RCFLAG_DYNAMIC) { /* * Requested deleting only for dynamic states. */ *ndel = 0; ipfw_expire_dyn_states(chain, rt); IPFW_UH_WUNLOCK(chain); return (0); } /* Allocate new map of the same size */ map = get_map(chain, 0, 1 /* locked */); if (map == NULL) { IPFW_UH_WUNLOCK(chain); return (ENOMEM); } n = 0; ndyn = 0; ofs = start; /* 1. bcopy the initial part of the map */ if (start > 0) bcopy(chain->map, map, start * sizeof(struct ip_fw *)); /* 2. copy active rules between start and end */ for (i = start; i < end; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) { map[ofs++] = rule; continue; } n++; if (ipfw_is_dyn_rule(rule) != 0) ndyn++; } /* 3. copy the final part of the map */ bcopy(chain->map + end, map + ofs, (chain->n_rules - end) * sizeof(struct ip_fw *)); /* 4. recalculate skipto cache */ update_skipto_cache(chain, map); /* 5. swap the maps (under UH_WLOCK + WHLOCK) */ map = swap_map(chain, map, chain->n_rules - n); /* 6. Remove all dynamic states originated by deleted rules */ if (ndyn > 0) ipfw_expire_dyn_states(chain, rt); /* 7. now remove the rules deleted from the old map */ for (i = start; i < end; i++) { rule = map[i]; if (ipfw_match_range(rule, rt) == 0) continue; chain->static_len -= RULEUSIZE0(rule); ipfw_reap_add(chain, &reap, rule); } IPFW_UH_WUNLOCK(chain); ipfw_reap_rules(reap); if (map != NULL) free(map, M_IPFW); *ndel = n; return (0); } static int move_objects(struct ip_fw_chain *ch, ipfw_range_tlv *rt) { struct opcode_obj_rewrite *rw; struct ip_fw *rule; ipfw_insn *cmd; int cmdlen, i, l, c; uint16_t kidx; IPFW_UH_WLOCK_ASSERT(ch); /* Stage 1: count number of references by given rules */ for (c = 0, i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* * When manage_sets() returns non-zero value to * COUNT_ONE command, consider this as an object * doesn't support sets (e.g. disabled with sysctl). * So, skip checks for this object. */ if (rw->manage_sets(ch, kidx, 1, COUNT_ONE) != 0) continue; c++; } } if (c == 0) /* No objects found */ return (0); /* Stage 2: verify "ownership" */ for (c = 0, i = 0; (i < ch->n_rules - 1) && c == 0; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0 && c == 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* Test for ownership and conflicting names */ c = rw->manage_sets(ch, kidx, (uint8_t)rt->new_set, TEST_ONE); } } /* Stage 3: change set and cleanup */ for (i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* cleanup object counter */ rw->manage_sets(ch, kidx, 0 /* reset counter */, COUNT_ONE); if (c != 0) continue; /* change set */ rw->manage_sets(ch, kidx, (uint8_t)rt->new_set, MOVE_ONE); } } return (c); } /* * Changes set of given rule rannge @rt * with each other. * * Returns 0 on success. */ static int move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { struct ip_fw *rule; int i; IPFW_UH_WLOCK(chain); /* * Move rules with matching paramenerts to a new set. * This one is much more complex. We have to ensure * that all referenced tables (if any) are referenced * by given rule subset only. Otherwise, we can't move * them to new set and have to return error. */ if ((i = move_objects(chain, rt)) != 0) { IPFW_UH_WUNLOCK(chain); return (i); } /* XXX: We have to do swap holding WLOCK */ for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; rule->set = rt->new_set; } IPFW_UH_WUNLOCK(chain); return (0); } /* + * Returns pointer to action instruction, skips all possible rule + * modifiers like O_LOG, O_TAG, O_ALTQ. + */ +ipfw_insn * +ipfw_get_action(struct ip_fw *rule) +{ + ipfw_insn *cmd; + int l, cmdlen; + + cmd = ACTION_PTR(rule); + l = rule->cmd_len - rule->act_ofs; + while (l > 0) { + switch (cmd->opcode) { + case O_ALTQ: + case O_LOG: + case O_TAG: + break; + default: + return (cmd); + } + cmdlen = F_LEN(cmd); + l -= cmdlen; + cmd += cmdlen; + } + panic("%s: rule (%p) has not action opcode", __func__, rule); + return (NULL); +} + +/* * Clear counters for a specific rule. * Normally run under IPFW_UH_RLOCK, but these are idempotent ops * so we only care that rules do not disappear. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) IPFW_ZERO_RULE_COUNTER(rule); if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /* * Flushes rules counters and/or log values on matching range. * * Returns number of items cleared. */ static int clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only) { struct ip_fw *rule; int num; int i; num = 0; rt->flags |= IPFW_RCFLAG_DEFAULT; IPFW_UH_WLOCK(chain); /* arbitrate writers */ for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; clear_counters(rule, log_only); num++; } IPFW_UH_WUNLOCK(chain); return (num); } static int check_range_tlv(ipfw_range_tlv *rt) { if (rt->head.length != sizeof(*rt)) return (1); if (rt->start_rule > rt->end_rule) return (1); if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS) return (1); if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags) return (1); return (0); } /* * Delete rules matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * Reply: [ ipfw_obj_header ipfw_range_tlv ] * * Saves number of deleted rules in ipfw_range_tlv->new_set. * * Returns 0 on success. */ static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int error, ndel; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); ndel = 0; if ((error = delete_range(chain, &rh->range, &ndel)) != 0) return (error); /* Save number of rules deleted */ rh->range.new_set = ndel; return (0); } /* * Move rules/sets matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * * Returns 0 on success. */ static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); return (move_range(chain, &rh->range)); } /* * Clear rule accounting data matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * Reply: [ ipfw_obj_header ipfw_range_tlv ] * * Saves number of cleared rules in ipfw_range_tlv->new_set. * * Returns 0 on success. */ static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int log_only, num; char *msg; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); log_only = (op3->opcode == IP_FW_XRESETLOG); num = clear_range(chain, &rh->range, log_only); if (rh->range.flags & IPFW_RCFLAG_ALL) msg = log_only ? "All logging counts reset" : "Accounting cleared"; else msg = log_only ? "logging count reset" : "cleared"; if (V_fw_verbose) { int lev = LOG_SECURITY | LOG_NOTICE; log(lev, "ipfw: %s.\n", msg); } /* Save number of rules cleared */ rh->range.new_set = num; return (0); } static void enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { uint32_t v_set; IPFW_UH_WLOCK_ASSERT(chain); /* Change enabled/disabled sets mask */ v_set = (V_set_disable | rt->set) & ~rt->new_set; v_set &= ~(1 << RESVD_SET); /* set RESVD_SET always enabled */ IPFW_WLOCK(chain); V_set_disable = v_set; IPFW_WUNLOCK(chain); } static int swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv) { struct opcode_obj_rewrite *rw; struct ip_fw *rule; int i; IPFW_UH_WLOCK_ASSERT(chain); if (rt->set == rt->new_set) /* nothing to do */ return (0); if (mv != 0) { /* * Berfore moving the rules we need to check that * there aren't any conflicting named objects. */ for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) { if (rw->manage_sets == NULL) continue; i = rw->manage_sets(chain, (uint8_t)rt->set, (uint8_t)rt->new_set, TEST_ALL); if (i != 0) return (EEXIST); } } /* Swap or move two sets */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->set == (uint8_t)rt->set) rule->set = (uint8_t)rt->new_set; else if (rule->set == (uint8_t)rt->new_set && mv == 0) rule->set = (uint8_t)rt->set; } for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) { if (rw->manage_sets == NULL) continue; rw->manage_sets(chain, (uint8_t)rt->set, (uint8_t)rt->new_set, mv != 0 ? MOVE_ALL: SWAP_ALL); } return (0); } /* * Swaps or moves set * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * * Returns 0 on success. */ static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int ret; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (rh->range.head.length != sizeof(ipfw_range_tlv)) return (1); /* enable_sets() expects bitmasks. */ if (op3->opcode != IP_FW_SET_ENABLE && (rh->range.set >= IPFW_MAX_SETS || rh->range.new_set >= IPFW_MAX_SETS)) return (EINVAL); ret = 0; IPFW_UH_WLOCK(chain); switch (op3->opcode) { case IP_FW_SET_SWAP: case IP_FW_SET_MOVE: ret = swap_sets(chain, &rh->range, op3->opcode == IP_FW_SET_MOVE); break; case IP_FW_SET_ENABLE: enable_sets(chain, &rh->range); break; } IPFW_UH_WUNLOCK(chain); return (ret); } /** * Remove all rules with given number, or do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an uint32_t. The low 16 bit are the rule or set number; * the next 8 bits are the new set; the top 8 bits indicate the command: * * 0 delete rules numbered "rulenum" * 1 delete rules in set "rulenum" * 2 move rules "rulenum" to set "new_set" * 3 move rules from set "rulenum" to set "new_set" * 4 swap sets "rulenum" and "new_set" * 5 delete rules "rulenum" and set "new_set" */ static int del_entry(struct ip_fw_chain *chain, uint32_t arg) { uint32_t num; /* rule number or old_set */ uint8_t cmd, new_set; int do_del, ndel; int error = 0; ipfw_range_tlv rt; num = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 5 || new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2 || cmd == 5) { if (num >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (num > RESVD_SET) /* old_set */ return EINVAL; } /* Convert old requests into new representation */ memset(&rt, 0, sizeof(rt)); rt.start_rule = num; rt.end_rule = num; rt.set = num; rt.new_set = new_set; do_del = 0; switch (cmd) { case 0: /* delete rules numbered "rulenum" */ if (num == 0) rt.flags |= IPFW_RCFLAG_ALL; else rt.flags |= IPFW_RCFLAG_RANGE; do_del = 1; break; case 1: /* delete rules in set "rulenum" */ rt.flags |= IPFW_RCFLAG_SET; do_del = 1; break; case 5: /* delete rules "rulenum" and set "new_set" */ rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET; rt.set = new_set; rt.new_set = 0; do_del = 1; break; case 2: /* move rules "rulenum" to set "new_set" */ rt.flags |= IPFW_RCFLAG_RANGE; break; case 3: /* move rules from set "rulenum" to set "new_set" */ IPFW_UH_WLOCK(chain); error = swap_sets(chain, &rt, 1); IPFW_UH_WUNLOCK(chain); return (error); case 4: /* swap sets "rulenum" and "new_set" */ IPFW_UH_WLOCK(chain); error = swap_sets(chain, &rt, 0); IPFW_UH_WUNLOCK(chain); return (error); default: return (ENOTSUP); } if (do_del != 0) { if ((error = delete_range(chain, &rt, &ndel)) != 0) return (error); if (ndel == 0 && (cmd != 1 && num != 0)) return (EINVAL); return (0); } return (move_range(chain, &rt)); } /** * Reset some or all counters on firewall rules. * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, * the next 8 bits are the set number, the top 8 bits are the command: * 0 work with rules from all set's; * 1 work with rules only from specified set. * Specified rule number is zero if we want to clear all entries. * log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { struct ip_fw *rule; char *msg; int i; uint16_t rulenum = arg & 0xffff; uint8_t set = (arg >> 16) & 0xff; uint8_t cmd = (arg >> 24) & 0xff; if (cmd > 1) return (EINVAL); if (cmd == 1 && set > RESVD_SET) return (EINVAL); IPFW_UH_RLOCK(chain); if (rulenum == 0) { V_norule_counter = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; /* Skip rules not in our set. */ if (cmd == 1 && rule->set != set) continue; clear_counters(rule, log_only); } msg = log_only ? "All logging counts reset" : "Accounting cleared"; } else { int cleared = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (rule->rulenum == rulenum) { if (cmd == 0 || rule->set == set) clear_counters(rule, log_only); cleared = 1; } if (rule->rulenum > rulenum) break; } if (!cleared) { /* we did not find any matching rules */ IPFW_UH_RUNLOCK(chain); return (EINVAL); } msg = log_only ? "logging count reset" : "cleared"; } IPFW_UH_RUNLOCK(chain); if (V_fw_verbose) { int lev = LOG_SECURITY | LOG_NOTICE; if (rulenum) log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); else log(lev, "ipfw: %s.\n", msg); } return (0); } /* * Check rule head in FreeBSD11 format * */ static int check_ipfw_rule1(struct ip_fw_rule *rule, int size, struct rule_check_info *ci) { int l; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* Check for valid cmd_len */ l = roundup2(RULESIZE(rule), sizeof(uint64_t)); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } if (rule->rulenum > IPFW_DEFAULT_RULE - 1) return (EINVAL); return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); } /* * Check rule head in FreeBSD8 format * */ static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, struct rule_check_info *ci) { int l; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* Check for valid cmd_len */ l = sizeof(*rule) + rule->cmd_len * 4 - 4; if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } if (rule->rulenum > IPFW_DEFAULT_RULE - 1) return (EINVAL); return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); } static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) { int cmdlen, l; int have_action; have_action = 0; /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; ci->object_opcodes++; break; case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_SOCKARG: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: case O_TAG: case O_SKIP_ACTION: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_EXTERNAL_ACTION: if (cmd->arg1 == 0 || cmdlen != F_INSN_SIZE(ipfw_insn)) { printf("ipfw: invalid external " "action opcode\n"); return (EINVAL); } ci->object_opcodes++; /* * Do we have O_EXTERNAL_INSTANCE or O_EXTERNAL_DATA * opcode? */ if (l != cmdlen) { l -= cmdlen; cmd += cmdlen; cmdlen = F_LEN(cmd); if (cmd->opcode == O_EXTERNAL_DATA) goto check_action; if (cmd->opcode != O_EXTERNAL_INSTANCE) { printf("ipfw: invalid opcode " "next to external action %u\n", cmd->opcode); return (EINVAL); } if (cmd->arg1 == 0 || cmdlen != F_INSN_SIZE(ipfw_insn)) { printf("ipfw: invalid external " "action instance opcode\n"); return (EINVAL); } ci->object_opcodes++; } goto check_action; case O_FIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if (cmd->arg1 >= rt_numfibs) { printf("ipfw: invalid fib number %d\n", cmd->arg1); return EINVAL; } break; case O_SETFIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if ((cmd->arg1 != IP_FW_TARG) && ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) { printf("ipfw: invalid fib number %d\n", cmd->arg1 & 0x7FFF); return EINVAL; } goto check_action; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; ci->object_opcodes++; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ((cmdlen & 1) == 0) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; case O_IP_DST_LOOKUP: if (cmd->arg1 >= V_fw_tables_max) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; ci->object_opcodes++; break; case O_IP_FLOW_LOOKUP: if (cmd->arg1 >= V_fw_tables_max) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; ci->object_opcodes++; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: case O_TCPMSS: case O_TCPWIN: case O_TAGGED: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_DSCP: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; ci->object_opcodes++; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; goto check_action; case O_FORWARD_IP: if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #ifdef INET6 case O_FORWARD_IP6: if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6)) goto bad_size; goto check_action; #endif /* INET6 */ case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else goto check_size; case O_NETGRAPH: case O_NGTEE: if (ng_ipfw_input_p == NULL) return EINVAL; else goto check_size; case O_NAT: if (!IPFW_NAT_LOADED) return EINVAL; if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) goto bad_size; goto check_action; case O_CHECK_STATE: ci->object_opcodes++; /* FALLTHROUGH */ case O_FORWARD_MAC: /* XXX not implemented yet */ case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: case O_SETDSCP: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: case O_REASS: case O_CALLRETURN: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return (EINVAL); } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return (EINVAL); } break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return (EPROTONOSUPPORT); #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return (EINVAL); } } } if (have_action == 0) { printf("ipfw: missing action\n"); return (EINVAL); } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return (EINVAL); } /* * Translation of requests for compatibility with FreeBSD 7.2/8. * a static variable tells us if we have an old client from userland, * and if necessary we translate requests and responses between the * two formats. */ static int is7 = 0; struct ip_fw7 { struct ip_fw7 *next; /* linked list of rules */ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ // #define RESVD_SET 31 /* set for default and persistent rules */ uint8_t _pad; /* padding */ // uint32_t id; /* rule id, only in v.8 */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; static int convert_rule_to_7(struct ip_fw_rule0 *rule); static int convert_rule_to_8(struct ip_fw_rule0 *rule); #ifndef RULESIZE7 #define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) #endif /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. * Must be run under IPFW_UH_RLOCK */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule; struct ip_fw_rule0 *dst; struct timeval boottime; int error, i, l, warnflag; time_t boot_seconds; warnflag = 0; getboottime(&boottime); boot_seconds = boottime.tv_sec; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (is7) { /* Convert rule to FreeBSd 7.2 format */ l = RULESIZE7(rule); if (bp + l + sizeof(uint32_t) <= ep) { bcopy(rule, bp, l + sizeof(uint32_t)); error = set_legacy_obj_kidx(chain, (struct ip_fw_rule0 *)bp); if (error != 0) return (0); error = convert_rule_to_7((struct ip_fw_rule0 *) bp); if (error) return 0; /*XXX correct? */ /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? */ bcopy(&V_set_disable, &(((struct ip_fw7 *)bp)->next_rule), sizeof(V_set_disable)); if (((struct ip_fw7 *)bp)->timestamp) ((struct ip_fw7 *)bp)->timestamp += boot_seconds; bp += l; } continue; /* go to next rule */ } l = RULEUSIZE0(rule); if (bp + l > ep) { /* should not happen */ printf("overflow dumping static rules\n"); break; } dst = (struct ip_fw_rule0 *)bp; export_rule0(rule, dst, l); error = set_legacy_obj_kidx(chain, dst); /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? * * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask * so we need to fail _after_ saving at least one mask. */ bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); if (dst->timestamp) dst->timestamp += boot_seconds; bp += l; if (error != 0) { if (error == 2) { /* Non-fatal table rewrite error. */ warnflag = 1; continue; } printf("Stop on rule %d. Fail to convert table\n", rule->rulenum); break; } } if (warnflag != 0) printf("ipfw: process %s is using legacy interfaces," " consider rebuilding\n", ""); ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */ return (bp - (char *)buf); } struct dump_args { uint32_t b; /* start rule */ uint32_t e; /* end rule */ uint32_t rcount; /* number of rules */ uint32_t rsize; /* rules size */ uint32_t tcount; /* number of tables */ int rcounters; /* counters */ uint32_t *bmask; /* index bitmask of used named objects */ }; void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv) { ntlv->head.type = no->etlv; ntlv->head.length = sizeof(*ntlv); ntlv->idx = no->kidx; strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); } /* * Export named object info in instance @ni, identified by @kidx * to ipfw_obj_ntlv. TLV is allocated from @sd space. * * Returns 0 on success. */ static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, struct sockopt_data *sd) { struct named_object *no; ipfw_obj_ntlv *ntlv; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("invalid object kernel index passed")); ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ipfw_export_obj_ntlv(no, ntlv); return (0); } static int export_named_objects(struct namedobj_instance *ni, struct dump_args *da, struct sockopt_data *sd) { int error, i; for (i = 0; i < IPFW_TABLES_MAX && da->tcount > 0; i++) { if ((da->bmask[i / 32] & (1 << (i % 32))) == 0) continue; if ((error = export_objhash_ntlv(ni, i, sd)) != 0) return (error); da->tcount--; } return (0); } static int dump_named_objects(struct ip_fw_chain *ch, struct dump_args *da, struct sockopt_data *sd) { ipfw_obj_ctlv *ctlv; int error; MPASS(da->tcount > 0); /* Header first */ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); ctlv->head.type = IPFW_TLV_TBLNAME_LIST; ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) + sizeof(*ctlv); ctlv->count = da->tcount; ctlv->objsize = sizeof(ipfw_obj_ntlv); /* Dump table names first (if any) */ error = export_named_objects(ipfw_get_table_objhash(ch), da, sd); if (error != 0) return (error); /* Then dump another named objects */ da->bmask += IPFW_TABLES_MAX / 32; return (export_named_objects(CHAIN_TO_SRV(ch), da, sd)); } /* * Dumps static rules with table TLVs in buffer @sd. * * Returns 0 on success. */ static int dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, struct sockopt_data *sd) { ipfw_obj_ctlv *ctlv; struct ip_fw *krule; caddr_t dst; int i, l; /* Dump rules */ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); ctlv->head.type = IPFW_TLV_RULE_LIST; ctlv->head.length = da->rsize + sizeof(*ctlv); ctlv->count = da->rcount; for (i = da->b; i < da->e; i++) { krule = chain->map[i]; l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv); if (da->rcounters != 0) l += sizeof(struct ip_fw_bcounter); dst = (caddr_t)ipfw_get_sopt_space(sd, l); if (dst == NULL) return (ENOMEM); export_rule1(krule, dst, l, da->rcounters); } return (0); } int ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx) { uint32_t bidx; /* * Maintain separate bitmasks for table and non-table objects. */ bidx = (etlv == IPFW_TLV_TBL_NAME) ? 0: IPFW_TABLES_MAX / 32; bidx += kidx / 32; if ((bmask[bidx] & (1 << (kidx % 32))) != 0) return (0); bmask[bidx] |= 1 << (kidx % 32); return (1); } /* * Marks every object index used in @rule with bit in @bmask. * Used to generate bitmask of referenced tables/objects for given ruleset * or its part. */ static void mark_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct dump_args *da) { struct opcode_obj_rewrite *rw; ipfw_insn *cmd; int cmdlen, l; uint16_t kidx; uint8_t subtype; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; if (ipfw_mark_object_kidx(da->bmask, rw->etlv, kidx)) da->tcount++; } } /* * Dumps requested objects data * Data layout (version 0)(current): * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags * size = ipfw_cfg_lheader.size * Reply: [ ipfw_cfg_lheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) * ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ] * ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional) * ] * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize. * The rest (size, count) are set to zero and needs to be ignored. * * Returns 0 on success. */ static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct dump_args da; ipfw_cfg_lheader *hdr; struct ip_fw *rule; size_t sz, rnum; uint32_t hdr_flags, *bmask; int error, i; hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); if (hdr == NULL) return (EINVAL); error = 0; bmask = NULL; memset(&da, 0, sizeof(da)); /* * Allocate needed state. * Note we allocate 2xspace mask, for table & srv */ if (hdr->flags & (IPFW_CFG_GET_STATIC | IPFW_CFG_GET_STATES)) da.bmask = bmask = malloc( sizeof(uint32_t) * IPFW_TABLES_MAX * 2 / 32, M_TEMP, M_WAITOK | M_ZERO); IPFW_UH_RLOCK(chain); /* * STAGE 1: Determine size/count for objects in range. * Prepare used tables bitmask. */ sz = sizeof(ipfw_cfg_lheader); da.e = chain->n_rules; if (hdr->end_rule != 0) { /* Handle custom range */ if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE) rnum = IPFW_DEFAULT_RULE; da.b = ipfw_find_rule(chain, rnum, 0); rnum = (hdr->end_rule < IPFW_DEFAULT_RULE) ? hdr->end_rule + 1: IPFW_DEFAULT_RULE; da.e = ipfw_find_rule(chain, rnum, UINT32_MAX) + 1; } if (hdr->flags & IPFW_CFG_GET_STATIC) { for (i = da.b; i < da.e; i++) { rule = chain->map[i]; da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv); da.rcount++; /* Update bitmask of used objects for given range */ mark_rule_objects(chain, rule, &da); } /* Add counters if requested */ if (hdr->flags & IPFW_CFG_GET_COUNTERS) { da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount; da.rcounters = 1; } sz += da.rsize + sizeof(ipfw_obj_ctlv); } if (hdr->flags & IPFW_CFG_GET_STATES) { sz += sizeof(ipfw_obj_ctlv) + ipfw_dyn_get_count(bmask, &i) * sizeof(ipfw_obj_dyntlv); da.tcount += i; } if (da.tcount > 0) sz += da.tcount * sizeof(ipfw_obj_ntlv) + sizeof(ipfw_obj_ctlv); /* * Fill header anyway. * Note we have to save header fields to stable storage * buffer inside @sd can be flushed after dumping rules */ hdr->size = sz; hdr->set_mask = ~V_set_disable; hdr_flags = hdr->flags; hdr = NULL; if (sd->valsize < sz) { error = ENOMEM; goto cleanup; } /* STAGE2: Store actual data */ if (da.tcount > 0) { error = dump_named_objects(chain, &da, sd); if (error != 0) goto cleanup; } if (hdr_flags & IPFW_CFG_GET_STATIC) { error = dump_static_rules(chain, &da, sd); if (error != 0) goto cleanup; } if (hdr_flags & IPFW_CFG_GET_STATES) error = ipfw_dump_states(chain, sd); cleanup: IPFW_UH_RUNLOCK(chain); if (bmask != NULL) free(bmask, M_TEMP); return (error); } int ipfw_check_object_name_generic(const char *name) { int nsize; nsize = sizeof(((ipfw_obj_ntlv *)0)->name); if (strnlen(name, nsize) == nsize) return (EINVAL); if (name[0] == '\0') return (EINVAL); return (0); } /* * Creates non-existent objects referenced by rule. * * Return 0 on success. */ int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti) { struct opcode_obj_rewrite *rw; struct obj_idx *p; uint16_t kidx; int error; /* * Compatibility stuff: do actual creation for non-existing, * but referenced objects. */ for (p = oib; p < pidx; p++) { if (p->kidx != 0) continue; ti->uidx = p->uidx; ti->type = p->type; ti->atype = 0; rw = find_op_rw(cmd + p->off, NULL, NULL); KASSERT(rw != NULL, ("Unable to find handler for op %d", (cmd + p->off)->opcode)); if (rw->create_object == NULL) error = EOPNOTSUPP; else error = rw->create_object(ch, ti, &kidx); if (error == 0) { p->kidx = kidx; continue; } /* * Error happened. We have to rollback everything. * Drop all already acquired references. */ IPFW_UH_WLOCK(ch); unref_oib_objects(ch, cmd, oib, pidx); IPFW_UH_WUNLOCK(ch); return (error); } return (0); } /* * Compatibility function for old ipfw(8) binaries. * Rewrites table/nat kernel indices with userland ones. * Convert tables matching '/^\d+$/' to their atoi() value. * Use number 65535 for other tables. * * Returns 0 on success. */ static int set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule) { struct opcode_obj_rewrite *rw; struct named_object *no; ipfw_insn *cmd; char *end; long val; int cmdlen, error, l; uint16_t kidx, uidx; uint8_t subtype; error = 0; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); /* Check if is index in given opcode */ rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; /* Try to find referenced kernel object */ no = rw->find_bykidx(ch, kidx); if (no == NULL) continue; val = strtol(no->name, &end, 10); if (*end == '\0' && val < 65535) { uidx = val; } else { /* * We are called via legacy opcode. * Save error and show table as fake number * not to make ipfw(8) hang. */ uidx = 65535; error = 2; } rw->update(cmd, uidx); } return (error); } /* * Unreferences all already-referenced objects in given @cmd rule, * using information in @oib. * * Used to rollback partially converted rule on error. */ static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *end) { struct opcode_obj_rewrite *rw; struct named_object *no; struct obj_idx *p; IPFW_UH_WLOCK_ASSERT(ch); for (p = oib; p < end; p++) { if (p->kidx == 0) continue; rw = find_op_rw(cmd + p->off, NULL, NULL); KASSERT(rw != NULL, ("Unable to find handler for op %d", (cmd + p->off)->opcode)); /* Find & unref by existing idx */ no = rw->find_bykidx(ch, p->kidx); KASSERT(no != NULL, ("Ref'd object %d disappeared", p->kidx)); no->refcnt--; } } /* * Remove references from every object used in @rule. * Used at rule removal code. */ static void unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule) { struct opcode_obj_rewrite *rw; struct named_object *no; ipfw_insn *cmd; int cmdlen, l; uint16_t kidx; uint8_t subtype; IPFW_UH_WLOCK_ASSERT(ch); l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; no = rw->find_bykidx(ch, kidx); KASSERT(no != NULL, ("object id %d not found", kidx)); KASSERT(no->subtype == subtype, ("wrong type %d (%d) for object id %d", no->subtype, subtype, kidx)); KASSERT(no->refcnt > 0, ("refcount for object %d is %d", kidx, no->refcnt)); if (no->refcnt == 1 && rw->destroy_object != NULL) rw->destroy_object(ch, no); else no->refcnt--; } } /* * Find and reference object (if any) stored in instruction @cmd. * * Saves object info in @pidx, sets * - @unresolved to 1 if object should exists but not found * * Returns non-zero value in case of error. */ static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, struct obj_idx *pidx, int *unresolved) { struct named_object *no; struct opcode_obj_rewrite *rw; int error; /* Check if this opcode is candidate for rewrite */ rw = find_op_rw(cmd, &ti->uidx, &ti->type); if (rw == NULL) return (0); /* Need to rewrite. Save necessary fields */ pidx->uidx = ti->uidx; pidx->type = ti->type; /* Try to find referenced kernel object */ error = rw->find_byname(ch, ti, &no); if (error != 0) return (error); if (no == NULL) { /* * Report about unresolved object for automaic * creation. */ *unresolved = 1; return (0); } /* * Object is already exist. * Its subtype should match with expected value. */ if (ti->type != no->subtype) return (EINVAL); /* Bump refcount and update kidx. */ no->refcnt++; rw->update(cmd, no->kidx); return (0); } /* * Finds and bumps refcount for objects referenced by given @rule. * Auto-creates non-existing tables. * Fills in @oib array with userland/kernel indexes. * * Returns 0 on success. */ static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti) { struct obj_idx *pidx; ipfw_insn *cmd; int cmdlen, error, l, unresolved; pidx = oib; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; error = 0; IPFW_UH_WLOCK(ch); /* Increase refcount on each existing referenced table. */ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); unresolved = 0; error = ref_opcode_object(ch, cmd, ti, pidx, &unresolved); if (error != 0) break; /* * Compatibility stuff for old clients: * prepare to automaitcally create non-existing objects. */ if (unresolved != 0) { pidx->off = rule->cmd_len - l; pidx++; } } if (error != 0) { /* Unref everything we have already done */ unref_oib_objects(ch, rule->cmd, oib, pidx); IPFW_UH_WUNLOCK(ch); return (error); } IPFW_UH_WUNLOCK(ch); /* Perform auto-creation for non-existing objects */ if (pidx != oib) error = create_objects_compat(ch, rule->cmd, oib, pidx, ti); /* Calculate real number of dynamic objects */ ci->object_opcodes = (uint16_t)(pidx - oib); return (error); } /* * Checks is opcode is referencing table of appropriate type. * Adds reference count for found table if true. * Rewrites user-supplied opcode values with kernel ones. * * Returns 0 on success and appropriate error code otherwise. */ static int rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci) { int error; ipfw_insn *cmd; uint8_t type; struct obj_idx *p, *pidx_first, *pidx_last; struct tid_info ti; /* * Prepare an array for storing opcode indices. * Use stack allocation by default. */ if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { /* Stack */ pidx_first = ci->obuf; } else pidx_first = malloc( ci->object_opcodes * sizeof(struct obj_idx), M_IPFW, M_WAITOK | M_ZERO); error = 0; type = 0; memset(&ti, 0, sizeof(ti)); /* Use set rule is assigned to. */ ti.set = ci->krule->set; if (ci->ctlv != NULL) { ti.tlvs = (void *)(ci->ctlv + 1); ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); } /* Reference all used tables and other objects */ error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti); if (error != 0) goto free; /* * Note that ref_rule_objects() might have updated ci->object_opcodes * to reflect actual number of object opcodes. */ /* Perform rewrite of remaining opcodes */ p = pidx_first; pidx_last = pidx_first + ci->object_opcodes; for (p = pidx_first; p < pidx_last; p++) { cmd = ci->krule->cmd + p->off; update_opcode_kidx(cmd, p->kidx); } free: if (pidx_first != ci->obuf) free(pidx_first, M_IPFW); return (error); } /* * Adds one or more rules to ipfw @chain. * Data layout (version 0)(current): * Request: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3) * ] * Reply: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] * ] * * Rules in reply are modified to store their actual ruleset number. * * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending * according to their idx field and there has to be no duplicates. * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. * (*3) Each ip_fw structure needs to be aligned to u64 boundary. * * Returns 0 on success. */ static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_ctlv *ctlv, *rtlv, *tstate; ipfw_obj_ntlv *ntlv; int clen, error, idx; uint32_t count, read; struct ip_fw_rule *r; struct rule_check_info rci, *ci, *cbuf; int i, rsize; op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize); ctlv = (ipfw_obj_ctlv *)(op3 + 1); read = sizeof(ip_fw3_opheader); rtlv = NULL; tstate = NULL; cbuf = NULL; memset(&rci, 0, sizeof(struct rule_check_info)); if (read + sizeof(*ctlv) > sd->valsize) return (EINVAL); if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { clen = ctlv->head.length; /* Check size and alignment */ if (clen > sd->valsize || clen < sizeof(*ctlv)) return (EINVAL); if ((clen % sizeof(uint64_t)) != 0) return (EINVAL); /* * Some table names or other named objects. * Check for validness. */ count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv); if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv)) return (EINVAL); /* * Check each TLV. * Ensure TLVs are sorted ascending and * there are no duplicates. */ idx = -1; ntlv = (ipfw_obj_ntlv *)(ctlv + 1); while (count > 0) { if (ntlv->head.length != sizeof(ipfw_obj_ntlv)) return (EINVAL); error = ipfw_check_object_name_generic(ntlv->name); if (error != 0) return (error); if (ntlv->idx <= idx) return (EINVAL); idx = ntlv->idx; count--; ntlv++; } tstate = ctlv; read += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } if (read + sizeof(*ctlv) > sd->valsize) return (EINVAL); if (ctlv->head.type == IPFW_TLV_RULE_LIST) { clen = ctlv->head.length; if (clen + read > sd->valsize || clen < sizeof(*ctlv)) return (EINVAL); if ((clen % sizeof(uint64_t)) != 0) return (EINVAL); /* * TODO: Permit adding multiple rules at once */ if (ctlv->count != 1) return (ENOTSUP); clen -= sizeof(*ctlv); if (ctlv->count > clen / sizeof(struct ip_fw_rule)) return (EINVAL); /* Allocate state for each rule or use stack */ if (ctlv->count == 1) { memset(&rci, 0, sizeof(struct rule_check_info)); cbuf = &rci; } else cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP, M_WAITOK | M_ZERO); ci = cbuf; /* * Check each rule for validness. * Ensure numbered rules are sorted ascending * and properly aligned */ idx = 0; r = (struct ip_fw_rule *)(ctlv + 1); count = 0; error = 0; while (clen > 0) { rsize = roundup2(RULESIZE(r), sizeof(uint64_t)); if (rsize > clen || ctlv->count <= count) { error = EINVAL; break; } ci->ctlv = tstate; error = check_ipfw_rule1(r, rsize, ci); if (error != 0) break; /* Check sorting */ if (r->rulenum != 0 && r->rulenum < idx) { printf("rulenum %d idx %d\n", r->rulenum, idx); error = EINVAL; break; } idx = r->rulenum; ci->urule = (caddr_t)r; rsize = roundup2(rsize, sizeof(uint64_t)); clen -= rsize; r = (struct ip_fw_rule *)((caddr_t)r + rsize); count++; ci++; } if (ctlv->count != count || error != 0) { if (cbuf != &rci) free(cbuf, M_TEMP); return (EINVAL); } rtlv = ctlv; read += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) { if (cbuf != NULL && cbuf != &rci) free(cbuf, M_TEMP); return (EINVAL); } /* * Passed rules seems to be valid. * Allocate storage and try to add them to chain. */ for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) { clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule); ci->krule = ipfw_alloc_rule(chain, clen); import_rule1(ci); } if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) { /* Free allocate krules */ for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) ipfw_free_rule(ci->krule); } if (cbuf != NULL && cbuf != &rci) free(cbuf, M_TEMP); return (error); } /* * Lists all sopts currently registered. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ] * * Returns 0 on success */ static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; ipfw_sopt_info *i; struct ipfw_sopt_handler *sh; uint32_t count, n, size; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); CTL3_LOCK(); count = ctl3_hsize; size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_sopt_info); if (size > olh->size) { olh->size = size; CTL3_UNLOCK(); return (ENOMEM); } olh->size = size; for (n = 1; n <= count; n++) { i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i)); KASSERT(i != NULL, ("previously checked buffer is not enough")); sh = &ctl3_handlers[n]; i->opcode = sh->opcode; i->version = sh->version; i->refcnt = sh->refcnt; } CTL3_UNLOCK(); return (0); } /* * Compares two opcodes. * Used both in qsort() and bsearch(). * * Returns 0 if match is found. */ static int compare_opcodes(const void *_a, const void *_b) { const struct opcode_obj_rewrite *a, *b; a = (const struct opcode_obj_rewrite *)_a; b = (const struct opcode_obj_rewrite *)_b; if (a->opcode < b->opcode) return (-1); else if (a->opcode > b->opcode) return (1); return (0); } /* * XXX: Rewrite bsearch() */ static int find_op_rw_range(uint16_t op, struct opcode_obj_rewrite **plo, struct opcode_obj_rewrite **phi) { struct opcode_obj_rewrite *ctl3_max, *lo, *hi, h, *rw; memset(&h, 0, sizeof(h)); h.opcode = op; rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters, ctl3_rsize, sizeof(h), compare_opcodes); if (rw == NULL) return (1); /* Find the first element matching the same opcode */ lo = rw; for ( ; lo > ctl3_rewriters && (lo - 1)->opcode == op; lo--) ; /* Find the last element matching the same opcode */ hi = rw; ctl3_max = ctl3_rewriters + ctl3_rsize; for ( ; (hi + 1) < ctl3_max && (hi + 1)->opcode == op; hi++) ; *plo = lo; *phi = hi; return (0); } /* * Finds opcode object rewriter based on @code. * * Returns pointer to handler or NULL. */ static struct opcode_obj_rewrite * find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) { struct opcode_obj_rewrite *rw, *lo, *hi; uint16_t uidx; uint8_t subtype; if (find_op_rw_range(cmd->opcode, &lo, &hi) != 0) return (NULL); for (rw = lo; rw <= hi; rw++) { if (rw->classifier(cmd, &uidx, &subtype) == 0) { if (puidx != NULL) *puidx = uidx; if (ptype != NULL) *ptype = subtype; return (rw); } } return (NULL); } int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx) { if (find_op_rw(cmd, puidx, NULL) == NULL) return (1); return (0); } void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx) { struct opcode_obj_rewrite *rw; rw = find_op_rw(cmd, NULL, NULL); KASSERT(rw != NULL, ("No handler to update opcode %d", cmd->opcode)); rw->update(cmd, idx); } void ipfw_init_obj_rewriter() { ctl3_rewriters = NULL; ctl3_rsize = 0; } void ipfw_destroy_obj_rewriter() { if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = NULL; ctl3_rsize = 0; } /* * Adds one or more opcode object rewrite handlers to the global array. * Function may sleep. */ void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) { size_t sz; struct opcode_obj_rewrite *tmp; CTL3_LOCK(); for (;;) { sz = ctl3_rsize + count; CTL3_UNLOCK(); tmp = malloc(sizeof(*rw) * sz, M_IPFW, M_WAITOK | M_ZERO); CTL3_LOCK(); if (ctl3_rsize + count <= sz) break; /* Retry */ free(tmp, M_IPFW); } /* Merge old & new arrays */ sz = ctl3_rsize + count; memcpy(tmp, ctl3_rewriters, ctl3_rsize * sizeof(*rw)); memcpy(&tmp[ctl3_rsize], rw, count * sizeof(*rw)); qsort(tmp, sz, sizeof(*rw), compare_opcodes); /* Switch new and free old */ if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = tmp; ctl3_rsize = sz; CTL3_UNLOCK(); } /* * Removes one or more object rewrite handlers from the global array. */ int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) { size_t sz; struct opcode_obj_rewrite *ctl3_max, *ktmp, *lo, *hi; int i; CTL3_LOCK(); for (i = 0; i < count; i++) { if (find_op_rw_range(rw[i].opcode, &lo, &hi) != 0) continue; for (ktmp = lo; ktmp <= hi; ktmp++) { if (ktmp->classifier != rw[i].classifier) continue; ctl3_max = ctl3_rewriters + ctl3_rsize; sz = (ctl3_max - (ktmp + 1)) * sizeof(*ktmp); memmove(ktmp, ktmp + 1, sz); ctl3_rsize--; break; } } if (ctl3_rsize == 0) { if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = NULL; } CTL3_UNLOCK(); return (0); } static int export_objhash_ntlv_internal(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct sockopt_data *sd; ipfw_obj_ntlv *ntlv; sd = (struct sockopt_data *)arg; ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ipfw_export_obj_ntlv(no, ntlv); return (0); } /* * Lists all service objects. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader [ ipfw_obj_ntlv x N ] (optional) ] * Returns 0 on success */ static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *hdr; int count; hdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); if (hdr == NULL) return (EINVAL); IPFW_UH_RLOCK(chain); count = ipfw_objhash_count(CHAIN_TO_SRV(chain)); hdr->size = sizeof(ipfw_obj_lheader) + count * sizeof(ipfw_obj_ntlv); if (sd->valsize < hdr->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } hdr->count = count; hdr->objsize = sizeof(ipfw_obj_ntlv); if (count > 0) ipfw_objhash_foreach(CHAIN_TO_SRV(chain), export_objhash_ntlv_internal, sd); IPFW_UH_RUNLOCK(chain); return (0); } /* * Compares two sopt handlers (code, version and handler ptr). * Used both as qsort() and bsearch(). * Does not compare handler for latter case. * * Returns 0 if match is found. */ static int compare_sh(const void *_a, const void *_b) { const struct ipfw_sopt_handler *a, *b; a = (const struct ipfw_sopt_handler *)_a; b = (const struct ipfw_sopt_handler *)_b; if (a->opcode < b->opcode) return (-1); else if (a->opcode > b->opcode) return (1); if (a->version < b->version) return (-1); else if (a->version > b->version) return (1); /* bsearch helper */ if (a->handler == NULL) return (0); if ((uintptr_t)a->handler < (uintptr_t)b->handler) return (-1); else if ((uintptr_t)a->handler > (uintptr_t)b->handler) return (1); return (0); } /* * Finds sopt handler based on @code and @version. * * Returns pointer to handler or NULL. */ static struct ipfw_sopt_handler * find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler) { struct ipfw_sopt_handler *sh, h; memset(&h, 0, sizeof(h)); h.opcode = code; h.version = version; h.handler = handler; sh = (struct ipfw_sopt_handler *)bsearch(&h, ctl3_handlers, ctl3_hsize, sizeof(h), compare_sh); return (sh); } static int find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh) { struct ipfw_sopt_handler *sh; CTL3_LOCK(); if ((sh = find_sh(opcode, version, NULL)) == NULL) { CTL3_UNLOCK(); printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n", opcode, version); return (EINVAL); } sh->refcnt++; ctl3_refct++; /* Copy handler data to requested buffer */ *psh = *sh; CTL3_UNLOCK(); return (0); } static void find_unref_sh(struct ipfw_sopt_handler *psh) { struct ipfw_sopt_handler *sh; CTL3_LOCK(); sh = find_sh(psh->opcode, psh->version, NULL); KASSERT(sh != NULL, ("ctl3 handler disappeared")); sh->refcnt--; ctl3_refct--; CTL3_UNLOCK(); } void ipfw_init_sopt_handler() { CTL3_LOCK_INIT(); IPFW_ADD_SOPT_HANDLER(1, scodes); } void ipfw_destroy_sopt_handler() { IPFW_DEL_SOPT_HANDLER(1, scodes); CTL3_LOCK_DESTROY(); } /* * Adds one or more sockopt handlers to the global array. * Function may sleep. */ void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) { size_t sz; struct ipfw_sopt_handler *tmp; CTL3_LOCK(); for (;;) { sz = ctl3_hsize + count; CTL3_UNLOCK(); tmp = malloc(sizeof(*sh) * sz, M_IPFW, M_WAITOK | M_ZERO); CTL3_LOCK(); if (ctl3_hsize + count <= sz) break; /* Retry */ free(tmp, M_IPFW); } /* Merge old & new arrays */ sz = ctl3_hsize + count; memcpy(tmp, ctl3_handlers, ctl3_hsize * sizeof(*sh)); memcpy(&tmp[ctl3_hsize], sh, count * sizeof(*sh)); qsort(tmp, sz, sizeof(*sh), compare_sh); /* Switch new and free old */ if (ctl3_handlers != NULL) free(ctl3_handlers, M_IPFW); ctl3_handlers = tmp; ctl3_hsize = sz; ctl3_gencnt++; CTL3_UNLOCK(); } /* * Removes one or more sockopt handlers from the global array. */ int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) { size_t sz; struct ipfw_sopt_handler *tmp, *h; int i; CTL3_LOCK(); for (i = 0; i < count; i++) { tmp = &sh[i]; h = find_sh(tmp->opcode, tmp->version, tmp->handler); if (h == NULL) continue; sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h); memmove(h, h + 1, sz); ctl3_hsize--; } if (ctl3_hsize == 0) { if (ctl3_handlers != NULL) free(ctl3_handlers, M_IPFW); ctl3_handlers = NULL; } ctl3_gencnt++; CTL3_UNLOCK(); return (0); } /* * Writes data accumulated in @sd to sockopt buffer. * Zeroes internal @sd buffer. */ static int ipfw_flush_sopt_data(struct sockopt_data *sd) { struct sockopt *sopt; int error; size_t sz; sz = sd->koff; if (sz == 0) return (0); sopt = sd->sopt; if (sopt->sopt_dir == SOPT_GET) { error = copyout(sd->kbuf, sopt->sopt_val, sz); if (error != 0) return (error); } memset(sd->kbuf, 0, sd->ksize); sd->ktotal += sz; sd->koff = 0; if (sd->ktotal + sd->ksize < sd->valsize) sd->kavail = sd->ksize; else sd->kavail = sd->valsize - sd->ktotal; /* Update sopt buffer data */ sopt->sopt_valsize = sd->ktotal; sopt->sopt_val = sd->sopt_val + sd->ktotal; return (0); } /* * Ensures that @sd buffer has contiguous @neeeded number of * bytes. * * Returns pointer to requested space or NULL. */ caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed) { int error; caddr_t addr; if (sd->kavail < needed) { /* * Flush data and try another time. */ error = ipfw_flush_sopt_data(sd); if (sd->kavail < needed || error != 0) return (NULL); } addr = sd->kbuf + sd->koff; sd->koff += needed; sd->kavail -= needed; return (addr); } /* * Requests @needed contiguous bytes from @sd buffer. * Function is used to notify subsystem that we are * interesed in first @needed bytes (request header) * and the rest buffer can be safely zeroed. * * Returns pointer to requested space or NULL. */ caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed) { caddr_t addr; if ((addr = ipfw_get_sopt_space(sd, needed)) == NULL) return (NULL); if (sd->kavail > 0) memset(sd->kbuf + sd->koff, 0, sd->kavail); return (addr); } /* * New sockopt handler. */ int ipfw_ctl3(struct sockopt *sopt) { int error, locked; size_t size, valsize; struct ip_fw_chain *chain; char xbuf[256]; struct sockopt_data sdata; struct ipfw_sopt_handler h; ip_fw3_opheader *op3 = NULL; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); if (error != 0) return (error); if (sopt->sopt_name != IP_FW3) return (ipfw_ctl(sopt)); chain = &V_layer3_chain; error = 0; /* Save original valsize before it is altered via sooptcopyin() */ valsize = sopt->sopt_valsize; memset(&sdata, 0, sizeof(sdata)); /* Read op3 header first to determine actual operation */ op3 = (ip_fw3_opheader *)xbuf; error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3)); if (error != 0) return (error); sopt->sopt_valsize = valsize; /* * Find and reference command. */ error = find_ref_sh(op3->opcode, op3->version, &h); if (error != 0) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error != 0) { find_unref_sh(&h); return (error); } } /* * Fill in sockopt_data structure that may be useful for * IP_FW3 get requests. */ locked = 0; if (valsize <= sizeof(xbuf)) { /* use on-stack buffer */ sdata.kbuf = xbuf; sdata.ksize = sizeof(xbuf); sdata.kavail = valsize; } else { /* * Determine opcode type/buffer size: * allocate sliding-window buf for data export or * contiguous buffer for special ops. */ if ((h.dir & HDIR_SET) != 0) { /* Set request. Allocate contigous buffer. */ if (valsize > CTL3_LARGEBUF) { find_unref_sh(&h); return (EFBIG); } size = valsize; } else { /* Get request. Allocate sliding window buffer */ size = (valsizesopt_val, valsize); if (error != 0) return (error); locked = 1; } } sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); sdata.ksize = size; sdata.kavail = size; } sdata.sopt = sopt; sdata.sopt_val = sopt->sopt_val; sdata.valsize = valsize; /* * Copy either all request (if valsize < bsize_max) * or first bsize_max bytes to guarantee most consumers * that all necessary data has been copied). * Anyway, copy not less than sizeof(ip_fw3_opheader). */ if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize, sizeof(ip_fw3_opheader))) != 0) return (error); op3 = (ip_fw3_opheader *)sdata.kbuf; /* Finally, run handler */ error = h.handler(chain, op3, &sdata); find_unref_sh(&h); /* Flush state and free buffers */ if (error == 0) error = ipfw_flush_sopt_data(&sdata); else ipfw_flush_sopt_data(&sdata); if (locked != 0) vsunlock(sdata.sopt_val, valsize); /* Restore original pointer and set number of bytes written */ sopt->sopt_val = sdata.sopt_val; sopt->sopt_valsize = sdata.ktotal; if (sdata.kbuf != xbuf) free(sdata.kbuf, M_TEMP); return (error); } /** * {set|get}sockopt parser. */ int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (512*sizeof(u_int32_t)) int error; size_t size, valsize; struct ip_fw *buf; struct ip_fw_rule0 *rule; struct ip_fw_chain *chain; u_int32_t rulenum[2]; uint32_t opt; struct rule_check_info ci; IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; error = 0; /* Save original valsize before it is altered via sooptcopyin() */ valsize = sopt->sopt_valsize; opt = sopt->sopt_name; /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (opt == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error != 0) return (error); } switch (opt) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. * * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ for (;;) { int len = 0, want; size = chain->static_len; size += ipfw_dyn_len(); if (size >= sopt->sopt_valsize) break; buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); IPFW_UH_RLOCK(chain); /* check again how much space we need */ want = chain->static_len + ipfw_dyn_len(); if (size >= want) len = ipfw_getrules(chain, buf, size); IPFW_UH_RUNLOCK(chain); if (size >= want) error = sooptcopyout(sopt, buf, len); free(buf, M_TEMP); if (size >= want) break; } break; case IP_FW_FLUSH: /* locking is done within del_entry() */ error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw7) ); memset(&ci, 0, sizeof(struct rule_check_info)); /* * If the size of commands equals RULESIZE7 then we assume * a FreeBSD7.2 binary is talking to us (set is7=1). * is7 is persistent so the next 'ipfw list' command * will use this format. * NOTE: If wrong version is guessed (this can happen if * the first ipfw command is 'ipfw [pipe] list') * the ipfw binary may crash or loop infinitly... */ size = sopt->sopt_valsize; if (size == RULESIZE7(rule)) { is7 = 1; error = convert_rule_to_8(rule); if (error) { free(rule, M_TEMP); return error; } size = RULESIZE(rule); } else is7 = 0; if (error == 0) error = check_ipfw_rule0(rule, size, &ci); if (error == 0) { /* locking is done within add_rule() */ struct ip_fw *krule; krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule)); ci.urule = (caddr_t)rule; ci.krule = krule; import_rule0(&ci); error = commit_rules(chain, &ci, 1); if (error != 0) ipfw_free_rule(ci.krule); else if (sopt->sopt_dir == SOPT_GET) { if (is7) { error = convert_rule_to_7(rule); size = RULESIZE7(rule); if (error) { free(rule, M_TEMP); return error; } } error = sooptcopyout(sopt, rule, size); } } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. */ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t) && rulenum[0] != 0) { /* delete or reassign, locking done in del_entry() */ error = del_entry(chain, rulenum[0]); } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ IPFW_UH_WLOCK(chain); V_set_disable = (V_set_disable | rulenum[0]) & ~rulenum[1] & ~(1<sopt_val != 0) { error = sooptcopyin(sopt, rulenum, sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; } error = zero_entry(chain, rulenum[0], sopt->sopt_name == IP_FW_RESETLOG); break; /*--- TABLE opcodes ---*/ case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: { ipfw_table_entry ent; struct tentry_info tei; struct tid_info ti; struct table_value v; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; memset(&tei, 0, sizeof(tei)); tei.paddr = &ent.addr; tei.subtype = AF_INET; tei.masklen = ent.masklen; ipfw_import_table_value_legacy(ent.value, &v); tei.pvalue = &v; memset(&ti, 0, sizeof(ti)); ti.uidx = ent.tbl; ti.type = IPFW_TABLE_CIDR; error = (opt == IP_FW_TABLE_ADD) ? add_table_entry(chain, &ti, &tei, 0, 1) : del_table_entry(chain, &ti, &tei, 0, 1); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; struct tid_info ti; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; memset(&ti, 0, sizeof(ti)); ti.uidx = tbl; error = flush_table(chain, &ti); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; struct tid_info ti; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; memset(&ti, 0, sizeof(ti)); ti.uidx = tbl; IPFW_RLOCK(chain); error = ipfw_count_table(chain, &ti, &cnt); IPFW_RUNLOCK(chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; struct tid_info ti; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); memset(&ti, 0, sizeof(ti)); ti.uidx = tbl->tbl; IPFW_RLOCK(chain); error = ipfw_dump_table_legacy(chain, &ti, tbl); IPFW_RUNLOCK(chain); if (error) { free(tbl, M_TEMP); break; } error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; /*--- NAT operations are protected by the IPFW_LOCK ---*/ case IP_FW_NAT_CFG: if (IPFW_NAT_LOADED) error = ipfw_nat_cfg_ptr(sopt); else { printf("IP_FW_NAT_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_DEL: if (IPFW_NAT_LOADED) error = ipfw_nat_del_ptr(sopt); else { printf("IP_FW_NAT_DEL: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_CONFIG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_cfg_ptr(sopt); else { printf("IP_FW_NAT_GET_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_LOG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_log_ptr(sopt); else { printf("IP_FW_NAT_GET_LOG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } #define RULE_MAXSIZE (256*sizeof(u_int32_t)) /* Functions to convert rules 7.2 <==> 8.0 */ static int convert_rule_to_7(struct ip_fw_rule0 *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; /* copy of original rule, version 8 */ struct ip_fw_rule0 *tmp; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule, tmp, RULE_MAXSIZE); /* Copy fields */ //rule7->_pad = tmp->_pad; rule7->set = tmp->set; rule7->rulenum = tmp->rulenum; rule7->cmd_len = tmp->cmd_len; rule7->act_ofs = tmp->act_ofs; rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; rule7->cmd_len = tmp->cmd_len; rule7->pcnt = tmp->pcnt; rule7->bcnt = tmp->bcnt; rule7->timestamp = tmp->timestamp; /* Copy commands */ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * decrement opcode if it is after O_REASS */ dst->opcode--; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } free(tmp, M_TEMP); return 0; } static int convert_rule_to_8(struct ip_fw_rule0 *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; /* Copy of original rule */ struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule7, tmp, RULE_MAXSIZE); for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * increment opcode if it is after O_REASS */ dst->opcode++; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } rule->_pad = tmp->_pad; rule->set = tmp->set; rule->rulenum = tmp->rulenum; rule->cmd_len = tmp->cmd_len; rule->act_ofs = tmp->act_ofs; rule->next_rule = (struct ip_fw *)tmp->next_rule; rule->cmd_len = tmp->cmd_len; rule->id = 0; /* XXX see if is ok = 0 */ rule->pcnt = tmp->pcnt; rule->bcnt = tmp->bcnt; rule->timestamp = tmp->timestamp; free (tmp, M_TEMP); return 0; } /* * Named object api * */ void ipfw_init_srv(struct ip_fw_chain *ch) { ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT); ch->srvstate = malloc(sizeof(void *) * IPFW_OBJECTS_DEFAULT, M_IPFW, M_WAITOK | M_ZERO); } void ipfw_destroy_srv(struct ip_fw_chain *ch) { free(ch->srvstate, M_IPFW); ipfw_objhash_destroy(ch->srvmap); } /* * Allocate new bitmask which can be used to enlarge/shrink * named instance index. */ void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks) { size_t size; int max_blocks; u_long *idx_mask; KASSERT((items % BLOCK_ITEMS) == 0, ("bitmask size needs to power of 2 and greater or equal to %zu", BLOCK_ITEMS)); max_blocks = items / BLOCK_ITEMS; size = items / 8; idx_mask = malloc(size * IPFW_MAX_SETS, M_IPFW, M_WAITOK); /* Mark all as free */ memset(idx_mask, 0xFF, size * IPFW_MAX_SETS); *idx_mask &= ~(u_long)1; /* Skip index 0 */ *idx = idx_mask; *pblocks = max_blocks; } /* * Copy current bitmask index to new one. */ void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks) { int old_blocks, new_blocks; u_long *old_idx, *new_idx; int i; old_idx = ni->idx_mask; old_blocks = ni->max_blocks; new_idx = *idx; new_blocks = *blocks; for (i = 0; i < IPFW_MAX_SETS; i++) { memcpy(&new_idx[new_blocks * i], &old_idx[old_blocks * i], old_blocks * sizeof(u_long)); } } /* * Swaps current @ni index with new one. */ void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks) { int old_blocks; u_long *old_idx; old_idx = ni->idx_mask; old_blocks = ni->max_blocks; ni->idx_mask = *idx; ni->max_blocks = *blocks; /* Save old values */ *idx = old_idx; *blocks = old_blocks; } void ipfw_objhash_bitmap_free(void *idx, int blocks) { free(idx, M_IPFW); } /* * Creates named hash instance. * Must be called without holding any locks. * Return pointer to new instance. */ struct namedobj_instance * ipfw_objhash_create(uint32_t items) { struct namedobj_instance *ni; int i; size_t size; size = sizeof(struct namedobj_instance) + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE + sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE; ni = malloc(size, M_IPFW, M_WAITOK | M_ZERO); ni->nn_size = NAMEDOBJ_HASH_SIZE; ni->nv_size = NAMEDOBJ_HASH_SIZE; ni->names = (struct namedobjects_head *)(ni +1); ni->values = &ni->names[ni->nn_size]; for (i = 0; i < ni->nn_size; i++) TAILQ_INIT(&ni->names[i]); for (i = 0; i < ni->nv_size; i++) TAILQ_INIT(&ni->values[i]); /* Set default hashing/comparison functions */ ni->hash_f = objhash_hash_name; ni->cmp_f = objhash_cmp_name; /* Allocate bitmask separately due to possible resize */ ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, &ni->max_blocks); return (ni); } void ipfw_objhash_destroy(struct namedobj_instance *ni) { free(ni->idx_mask, M_IPFW); free(ni, M_IPFW); } void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f) { ni->hash_f = hash_f; ni->cmp_f = cmp_f; } static uint32_t objhash_hash_name(struct namedobj_instance *ni, const void *name, uint32_t set) { return (fnv_32_str((const char *)name, FNV1_32_INIT)); } static int objhash_cmp_name(struct named_object *no, const void *name, uint32_t set) { if ((strcmp(no->name, (const char *)name) == 0) && (no->set == set)) return (0); return (1); } static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val) { uint32_t v; v = val % (ni->nv_size - 1); return (v); } struct named_object * ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name) { struct named_object *no; uint32_t hash; hash = ni->hash_f(ni, name, set) % ni->nn_size; TAILQ_FOREACH(no, &ni->names[hash], nn_next) { if (ni->cmp_f(no, name, set) == 0) return (no); } return (NULL); } /* * Find named object by @uid. * Check @tlvs for valid data inside. * * Returns pointer to found TLV or NULL. */ ipfw_obj_ntlv * ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv) { ipfw_obj_ntlv *ntlv; uintptr_t pa, pe; int l; pa = (uintptr_t)tlvs; pe = pa + len; l = 0; for (; pa < pe; pa += l) { ntlv = (ipfw_obj_ntlv *)pa; l = ntlv->head.length; if (l != sizeof(*ntlv)) return (NULL); if (ntlv->idx != uidx) continue; /* * When userland has specified zero TLV type, do * not compare it with eltv. In some cases userland * doesn't know what type should it have. Use only * uidx and name for search named_object. */ if (ntlv->head.type != 0 && ntlv->head.type != (uint16_t)etlv) continue; if (ipfw_check_object_name_generic(ntlv->name) != 0) return (NULL); return (ntlv); } return (NULL); } /* * Finds object config based on either legacy index * or name in ntlv. * Note @ti structure contains unchecked data from userland. * * Returns 0 in success and fills in @pno with found config */ int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, uint32_t etlv, struct named_object **pno) { char *name; ipfw_obj_ntlv *ntlv; uint32_t set; if (ti->tlvs == NULL) return (EINVAL); ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, etlv); if (ntlv == NULL) return (EINVAL); name = ntlv->name; /* * Use set provided by @ti instead of @ntlv one. * This is needed due to different sets behavior * controlled by V_fw_tables_sets. */ set = ti->set; *pno = ipfw_objhash_lookup_name(ni, set, name); if (*pno == NULL) return (ESRCH); return (0); } /* * Find named object by name, considering also its TLV type. */ struct named_object * ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, uint32_t type, const char *name) { struct named_object *no; uint32_t hash; hash = ni->hash_f(ni, name, set) % ni->nn_size; TAILQ_FOREACH(no, &ni->names[hash], nn_next) { if (ni->cmp_f(no, name, set) == 0 && no->etlv == (uint16_t)type) return (no); } return (NULL); } struct named_object * ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx) { struct named_object *no; uint32_t hash; hash = objhash_hash_idx(ni, kidx); TAILQ_FOREACH(no, &ni->values[hash], nv_next) { if (no->kidx == kidx) return (no); } return (NULL); } int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, struct named_object *b) { if ((strcmp(a->name, b->name) == 0) && a->set == b->set) return (1); return (0); } void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no) { uint32_t hash; hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; TAILQ_INSERT_HEAD(&ni->names[hash], no, nn_next); hash = objhash_hash_idx(ni, no->kidx); TAILQ_INSERT_HEAD(&ni->values[hash], no, nv_next); ni->count++; } void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no) { uint32_t hash; hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; TAILQ_REMOVE(&ni->names[hash], no, nn_next); hash = objhash_hash_idx(ni, no->kidx); TAILQ_REMOVE(&ni->values[hash], no, nv_next); ni->count--; } uint32_t ipfw_objhash_count(struct namedobj_instance *ni) { return (ni->count); } uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type) { struct named_object *no; uint32_t count; int i; count = 0; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH(no, &ni->names[i], nn_next) { if (no->etlv == type) count++; } } return (count); } /* * Runs @func for each found named object. * It is safe to delete objects from callback */ int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg) { struct named_object *no, *no_tmp; int i, ret; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) { ret = f(ni, no, arg); if (ret != 0) return (ret); } } return (0); } /* * Runs @f for each found named object with type @type. * It is safe to delete objects from callback */ int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f, void *arg, uint16_t type) { struct named_object *no, *no_tmp; int i, ret; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) { if (no->etlv != type) continue; ret = f(ni, no, arg); if (ret != 0) return (ret); } } return (0); } /* * Removes index from given set. * Returns 0 on success. */ int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx) { u_long *mask; int i, v; i = idx / BLOCK_ITEMS; v = idx % BLOCK_ITEMS; if (i >= ni->max_blocks) return (1); mask = &ni->idx_mask[i]; if ((*mask & ((u_long)1 << v)) != 0) return (1); /* Mark as free */ *mask |= (u_long)1 << v; /* Update free offset */ if (ni->free_off[0] > i) ni->free_off[0] = i; return (0); } /* * Allocate new index in given instance and stores in in @pidx. * Returns 0 on success. */ int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx) { struct namedobj_instance *ni; u_long *mask; int i, off, v; ni = (struct namedobj_instance *)n; off = ni->free_off[0]; mask = &ni->idx_mask[off]; for (i = off; i < ni->max_blocks; i++, mask++) { if ((v = ffsl(*mask)) == 0) continue; /* Mark as busy */ *mask &= ~ ((u_long)1 << (v - 1)); ni->free_off[0] = i; v = BLOCK_ITEMS * i + v - 1; *pidx = v; return (0); } return (1); } /* end of file */ Index: projects/fuse2/sys/netpfil/pf/pf.c =================================================================== --- projects/fuse2/sys/netpfil/pf/pf.c (revision 350434) +++ projects/fuse2/sys/netpfil/pf/pf.c (revision 350435) @@ -1,6706 +1,6704 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Daniel Hartmeier * Copyright (c) 2002 - 2008 Henning Brauer * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F30602-01-2-0537. * * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_bpf.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif /* INET6 */ #include #include #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x /* * Global variables */ /* state tables */ VNET_DEFINE(struct pf_altqqueue, pf_altqs[4]); VNET_DEFINE(struct pf_palist, pf_pabuf); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive); VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_inactive); VNET_DEFINE(struct pf_kstatus, pf_status); VNET_DEFINE(u_int32_t, ticket_altqs_active); VNET_DEFINE(u_int32_t, ticket_altqs_inactive); VNET_DEFINE(int, altqs_inactive_open); VNET_DEFINE(u_int32_t, ticket_pabuf); VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx); #define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx) VNET_DEFINE(u_char, pf_tcp_secret[16]); #define V_pf_tcp_secret VNET(pf_tcp_secret) VNET_DEFINE(int, pf_tcp_secret_init); #define V_pf_tcp_secret_init VNET(pf_tcp_secret_init) VNET_DEFINE(int, pf_tcp_iss_off); #define V_pf_tcp_iss_off VNET(pf_tcp_iss_off) VNET_DECLARE(int, pf_vnet_active); #define V_pf_vnet_active VNET(pf_vnet_active) VNET_DEFINE_STATIC(uint32_t, pf_purge_idx); #define V_pf_purge_idx VNET(pf_purge_idx) /* * Queue for pf_intr() sends. */ static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations"); struct pf_send_entry { STAILQ_ENTRY(pf_send_entry) pfse_next; struct mbuf *pfse_m; enum { PFSE_IP, PFSE_IP6, PFSE_ICMP, PFSE_ICMP6, } pfse_type; struct { int type; int code; int mtu; } icmpopts; }; STAILQ_HEAD(pf_send_head, pf_send_entry); VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue); #define V_pf_sendqueue VNET(pf_sendqueue) static struct mtx pf_sendqueue_mtx; MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF); #define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx) #define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx) /* * Queue for pf_overload_task() tasks. */ struct pf_overload_entry { SLIST_ENTRY(pf_overload_entry) next; struct pf_addr addr; sa_family_t af; uint8_t dir; struct pf_rule *rule; }; SLIST_HEAD(pf_overload_head, pf_overload_entry); VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue); #define V_pf_overloadqueue VNET(pf_overloadqueue) VNET_DEFINE_STATIC(struct task, pf_overloadtask); #define V_pf_overloadtask VNET(pf_overloadtask) static struct mtx pf_overloadqueue_mtx; MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx, "pf overload/flush queue", MTX_DEF); #define PF_OVERLOADQ_LOCK() mtx_lock(&pf_overloadqueue_mtx) #define PF_OVERLOADQ_UNLOCK() mtx_unlock(&pf_overloadqueue_mtx) VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules); struct mtx pf_unlnkdrules_mtx; MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules", MTX_DEF); VNET_DEFINE_STATIC(uma_zone_t, pf_sources_z); #define V_pf_sources_z VNET(pf_sources_z) uma_zone_t pf_mtag_z; VNET_DEFINE(uma_zone_t, pf_state_z); VNET_DEFINE(uma_zone_t, pf_state_key_z); VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]); #define PFID_CPUBITS 8 #define PFID_CPUSHIFT (sizeof(uint64_t) * NBBY - PFID_CPUBITS) #define PFID_CPUMASK ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT) #define PFID_MAXID (~PFID_CPUMASK) CTASSERT((1 << PFID_CPUBITS) >= MAXCPU); static void pf_src_tree_remove_state(struct pf_state *); static void pf_init_threshold(struct pf_threshold *, u_int32_t, u_int32_t); static void pf_add_threshold(struct pf_threshold *); static int pf_check_threshold(struct pf_threshold *); static void pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *, u_int16_t *, u_int16_t *, struct pf_addr *, u_int16_t, u_int8_t, sa_family_t); static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, struct tcphdr *, struct pf_state_peer *); static void pf_change_icmp(struct pf_addr *, u_int16_t *, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t *, u_int16_t *, u_int16_t *, u_int16_t *, u_int8_t, sa_family_t); static void pf_send_tcp(struct mbuf *, const struct pf_rule *, sa_family_t, const struct pf_addr *, const struct pf_addr *, u_int16_t, u_int16_t, u_int32_t, u_int32_t, u_int8_t, u_int16_t, u_int16_t, u_int8_t, int, u_int16_t, struct ifnet *); static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, sa_family_t, struct pf_rule *); static void pf_detach_state(struct pf_state *); static int pf_state_key_attach(struct pf_state_key *, struct pf_state_key *, struct pf_state *); static void pf_state_key_detach(struct pf_state *, int); static int pf_state_key_ctor(void *, int, void *, int); static u_int32_t pf_tcp_iss(struct pf_pdesc *); static int pf_test_rule(struct pf_rule **, struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **, struct inpcb *); static int pf_create_state(struct pf_rule *, struct pf_rule *, struct pf_rule *, struct pf_pdesc *, struct pf_src_node *, struct pf_state_key *, struct pf_state_key *, struct mbuf *, int, u_int16_t, u_int16_t, int *, struct pfi_kif *, struct pf_state **, int, u_int16_t, u_int16_t, int); static int pf_test_fragment(struct pf_rule **, int, struct pfi_kif *, struct mbuf *, void *, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **); static int pf_tcp_track_full(struct pf_state_peer *, struct pf_state_peer *, struct pf_state **, struct pfi_kif *, struct mbuf *, int, struct pf_pdesc *, u_short *, int *); static int pf_tcp_track_sloppy(struct pf_state_peer *, struct pf_state_peer *, struct pf_state **, struct pf_pdesc *, u_short *); static int pf_test_state_tcp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_udp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *); static int pf_test_state_icmp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_other(struct pf_state **, int, struct pfi_kif *, struct mbuf *, struct pf_pdesc *); static u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, sa_family_t); static u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, sa_family_t); static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, int, u_int16_t); static int pf_check_proto_cksum(struct mbuf *, int, int, u_int8_t, sa_family_t); static void pf_print_state_parts(struct pf_state *, struct pf_state_key *, struct pf_state_key *); static int pf_addr_wrap_neq(struct pf_addr_wrap *, struct pf_addr_wrap *); static struct pf_state *pf_find_state(struct pfi_kif *, struct pf_state_key_cmp *, u_int); static int pf_src_connlimit(struct pf_state **); static void pf_overload_task(void *v, int pending); static int pf_insert_src_node(struct pf_src_node **, struct pf_rule *, struct pf_addr *, sa_family_t); static u_int pf_purge_expired_states(u_int, int); static void pf_purge_unlinked_rules(void); static int pf_mtag_uminit(void *, int, int); static void pf_mtag_free(struct m_tag *); #ifdef INET static void pf_route(struct mbuf **, struct pf_rule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *, struct inpcb *); #endif /* INET */ #ifdef INET6 static void pf_change_a6(struct pf_addr *, u_int16_t *, struct pf_addr *, u_int8_t); static void pf_route6(struct mbuf **, struct pf_rule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *, struct inpcb *); #endif /* INET6 */ int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); extern int pf_end_threads; extern struct proc *pf_purge_proc; VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); #define PACKET_LOOPED(pd) ((pd)->pf_mtag && \ (pd)->pf_mtag->flags & PF_PACKET_LOOPED) #define STATE_LOOKUP(i, k, d, s, pd) \ do { \ (s) = pf_find_state((i), (k), (d)); \ if ((s) == NULL) \ return (PF_DROP); \ if (PACKET_LOOPED(pd)) \ return (PF_PASS); \ if ((d) == PF_OUT && \ (((s)->rule.ptr->rt == PF_ROUTETO && \ (s)->rule.ptr->direction == PF_OUT) || \ ((s)->rule.ptr->rt == PF_REPLYTO && \ (s)->rule.ptr->direction == PF_IN)) && \ (s)->rt_kif != NULL && \ (s)->rt_kif != (i)) \ return (PF_PASS); \ } while (0) #define BOUND_IFACE(r, k) \ ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all #define STATE_INC_COUNTERS(s) \ do { \ counter_u64_add(s->rule.ptr->states_cur, 1); \ counter_u64_add(s->rule.ptr->states_tot, 1); \ if (s->anchor.ptr != NULL) { \ counter_u64_add(s->anchor.ptr->states_cur, 1); \ counter_u64_add(s->anchor.ptr->states_tot, 1); \ } \ if (s->nat_rule.ptr != NULL) { \ counter_u64_add(s->nat_rule.ptr->states_cur, 1);\ counter_u64_add(s->nat_rule.ptr->states_tot, 1);\ } \ } while (0) #define STATE_DEC_COUNTERS(s) \ do { \ if (s->nat_rule.ptr != NULL) \ counter_u64_add(s->nat_rule.ptr->states_cur, -1);\ if (s->anchor.ptr != NULL) \ counter_u64_add(s->anchor.ptr->states_cur, -1); \ counter_u64_add(s->rule.ptr->states_cur, -1); \ } while (0) MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); VNET_DEFINE(struct pf_keyhash *, pf_keyhash); VNET_DEFINE(struct pf_idhash *, pf_idhash); VNET_DEFINE(struct pf_srchash *, pf_srchash); SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)"); u_long pf_hashmask; u_long pf_srchashmask; static u_long pf_hashsize; static u_long pf_srchashsize; u_long pf_ioctl_maxcount = 65535; SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN, &pf_hashsize, 0, "Size of pf(4) states hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN, &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable"); SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RDTUN, &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call"); VNET_DEFINE(void *, pf_swi_cookie); VNET_DEFINE(uint32_t, pf_hashseed); #define V_pf_hashseed VNET(pf_hashseed) int pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: if (a->addr32[0] > b->addr32[0]) return (1); if (a->addr32[0] < b->addr32[0]) return (-1); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (a->addr32[3] > b->addr32[3]) return (1); if (a->addr32[3] < b->addr32[3]) return (-1); if (a->addr32[2] > b->addr32[2]) return (1); if (a->addr32[2] < b->addr32[2]) return (-1); if (a->addr32[1] > b->addr32[1]) return (1); if (a->addr32[1] < b->addr32[1]) return (-1); if (a->addr32[0] > b->addr32[0]) return (1); if (a->addr32[0] < b->addr32[0]) return (-1); break; #endif /* INET6 */ default: panic("%s: unknown address family %u", __func__, af); } return (0); } static __inline uint32_t pf_hashkey(struct pf_state_key *sk) { uint32_t h; h = murmur3_32_hash32((uint32_t *)sk, sizeof(struct pf_state_key_cmp)/sizeof(uint32_t), V_pf_hashseed); return (h & pf_hashmask); } static __inline uint32_t pf_hashsrc(struct pf_addr *addr, sa_family_t af) { uint32_t h; switch (af) { case AF_INET: h = murmur3_32_hash32((uint32_t *)&addr->v4, sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed); break; case AF_INET6: h = murmur3_32_hash32((uint32_t *)&addr->v6, sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed); break; default: panic("%s: unknown address family %u", __func__, af); } return (h & pf_srchashmask); } #ifdef ALTQ static int pf_state_hash(struct pf_state *s) { u_int32_t hv = (intptr_t)s / sizeof(*s); hv ^= crc32(&s->src, sizeof(s->src)); hv ^= crc32(&s->dst, sizeof(s->dst)); if (hv == 0) hv = 1; return (hv); } #endif #ifdef INET6 void pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: dst->addr32[0] = src->addr32[0]; break; #endif /* INET */ case AF_INET6: dst->addr32[0] = src->addr32[0]; dst->addr32[1] = src->addr32[1]; dst->addr32[2] = src->addr32[2]; dst->addr32[3] = src->addr32[3]; break; } } #endif /* INET6 */ static void pf_init_threshold(struct pf_threshold *threshold, u_int32_t limit, u_int32_t seconds) { threshold->limit = limit * PF_THRESHOLD_MULT; threshold->seconds = seconds; threshold->count = 0; threshold->last = time_uptime; } static void pf_add_threshold(struct pf_threshold *threshold) { u_int32_t t = time_uptime, diff = t - threshold->last; if (diff >= threshold->seconds) threshold->count = 0; else threshold->count -= threshold->count * diff / threshold->seconds; threshold->count += PF_THRESHOLD_MULT; threshold->last = t; } static int pf_check_threshold(struct pf_threshold *threshold) { return (threshold->count > threshold->limit); } static int pf_src_connlimit(struct pf_state **state) { struct pf_overload_entry *pfoe; int bad = 0; PF_STATE_LOCK_ASSERT(*state); (*state)->src_node->conn++; (*state)->src.tcp_est = 1; pf_add_threshold(&(*state)->src_node->conn_rate); if ((*state)->rule.ptr->max_src_conn && (*state)->rule.ptr->max_src_conn < (*state)->src_node->conn) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1); bad++; } if ((*state)->rule.ptr->max_src_conn_rate.limit && pf_check_threshold(&(*state)->src_node->conn_rate)) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1); bad++; } if (!bad) return (0); /* Kill this state. */ (*state)->timeout = PFTM_PURGE; (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; if ((*state)->rule.ptr->overload_tbl == NULL) return (1); /* Schedule overloading and flushing task. */ pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT); if (pfoe == NULL) return (1); /* too bad :( */ bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr)); pfoe->af = (*state)->key[PF_SK_WIRE]->af; pfoe->rule = (*state)->rule.ptr; pfoe->dir = (*state)->direction; PF_OVERLOADQ_LOCK(); SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next); PF_OVERLOADQ_UNLOCK(); taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask); return (1); } static void pf_overload_task(void *v, int pending) { struct pf_overload_head queue; struct pfr_addr p; struct pf_overload_entry *pfoe, *pfoe1; uint32_t killed = 0; CURVNET_SET((struct vnet *)v); PF_OVERLOADQ_LOCK(); queue = V_pf_overloadqueue; SLIST_INIT(&V_pf_overloadqueue); PF_OVERLOADQ_UNLOCK(); bzero(&p, sizeof(p)); SLIST_FOREACH(pfoe, &queue, next) { counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("%s: blocking address ", __func__); pf_print_host(&pfoe->addr, 0, pfoe->af); printf("\n"); } p.pfra_af = pfoe->af; switch (pfoe->af) { #ifdef INET case AF_INET: p.pfra_net = 32; p.pfra_ip4addr = pfoe->addr.v4; break; #endif #ifdef INET6 case AF_INET6: p.pfra_net = 128; p.pfra_ip6addr = pfoe->addr.v6; break; #endif } PF_RULES_WLOCK(); pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second); PF_RULES_WUNLOCK(); } /* * Remove those entries, that don't need flushing. */ SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) if (pfoe->rule->flush == 0) { SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next); free(pfoe, M_PFTEMP); } else counter_u64_add( V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1); /* If nothing to flush, return. */ if (SLIST_EMPTY(&queue)) { CURVNET_RESTORE(); return; } for (int i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_state_key *sk; struct pf_state *s; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { sk = s->key[PF_SK_WIRE]; SLIST_FOREACH(pfoe, &queue, next) if (sk->af == pfoe->af && ((pfoe->rule->flush & PF_FLUSH_GLOBAL) || pfoe->rule == s->rule.ptr) && ((pfoe->dir == PF_OUT && PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) || (pfoe->dir == PF_IN && PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) { s->timeout = PFTM_PURGE; s->src.state = s->dst.state = TCPS_CLOSED; killed++; } } PF_HASHROW_UNLOCK(ih); } SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) free(pfoe, M_PFTEMP); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: %u states killed", __func__, killed); CURVNET_RESTORE(); } /* * Can return locked on failure, so that we can consistently * allocate and insert a new one. */ struct pf_src_node * pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af, int returnlocked) { struct pf_srchash *sh; struct pf_src_node *n; counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1); sh = &V_pf_srchash[pf_hashsrc(src, af)]; PF_HASHROW_LOCK(sh); LIST_FOREACH(n, &sh->nodes, entry) if (n->rule.ptr == rule && n->af == af && ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) || (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0))) break; if (n != NULL) { n->states++; PF_HASHROW_UNLOCK(sh); } else if (returnlocked == 0) PF_HASHROW_UNLOCK(sh); return (n); } static int pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule, struct pf_addr *src, sa_family_t af) { KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK || rule->rpool.opts & PF_POOL_STICKYADDR), ("%s for non-tracking rule %p", __func__, rule)); if (*sn == NULL) *sn = pf_find_src_node(src, rule, af, 1); if (*sn == NULL) { struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)]; PF_HASHROW_ASSERT(sh); if (!rule->max_src_nodes || counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes) (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); else counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], 1); if ((*sn) == NULL) { PF_HASHROW_UNLOCK(sh); return (-1); } pf_init_threshold(&(*sn)->conn_rate, rule->max_src_conn_rate.limit, rule->max_src_conn_rate.seconds); (*sn)->af = af; (*sn)->rule.ptr = rule; PF_ACPY(&(*sn)->addr, src, af); LIST_INSERT_HEAD(&sh->nodes, *sn, entry); (*sn)->creation = time_uptime; (*sn)->ruletype = rule->action; (*sn)->states = 1; if ((*sn)->rule.ptr != NULL) counter_u64_add((*sn)->rule.ptr->src_nodes, 1); PF_HASHROW_UNLOCK(sh); counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1); } else { if (rule->max_src_states && (*sn)->states >= rule->max_src_states) { counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES], 1); return (-1); } } return (0); } void pf_unlink_src_node(struct pf_src_node *src) { PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]); LIST_REMOVE(src, entry); if (src->rule.ptr) counter_u64_add(src->rule.ptr->src_nodes, -1); } u_int pf_free_src_nodes(struct pf_src_node_list *head) { struct pf_src_node *sn, *tmp; u_int count = 0; LIST_FOREACH_SAFE(sn, head, entry, tmp) { uma_zfree(V_pf_sources_z, sn); count++; } counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count); return (count); } void pf_mtag_initialize() { pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) + sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL, UMA_ALIGN_PTR, 0); } /* Per-vnet data storage structures initialization. */ void pf_initialize() { struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; u_int i; if (pf_hashsize == 0 || !powerof2(pf_hashsize)) pf_hashsize = PF_HASHSIZ; if (pf_srchashsize == 0 || !powerof2(pf_srchashsize)) pf_srchashsize = PF_SRCHASHSIZ; V_pf_hashseed = arc4random(); /* States and state keys storage. */ V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z; uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT); uma_zone_set_warning(V_pf_state_z, "PF states limit reached"); V_pf_state_key_z = uma_zcreate("pf state keys", sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash), M_PFHASH, M_NOWAIT | M_ZERO); V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash), M_PFHASH, M_NOWAIT | M_ZERO); if (V_pf_keyhash == NULL || V_pf_idhash == NULL) { printf("pf: Unable to allocate memory for " "state_hashsize %lu.\n", pf_hashsize); free(V_pf_keyhash, M_PFHASH); free(V_pf_idhash, M_PFHASH); pf_hashsize = PF_HASHSIZ; V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO); V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO); } pf_hashmask = pf_hashsize - 1; for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask; i++, kh++, ih++) { mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF); } /* Source nodes. */ V_pf_sources_z = uma_zcreate("pf source nodes", sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z; uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT); uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached"); V_pf_srchash = mallocarray(pf_srchashsize, sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO); if (V_pf_srchash == NULL) { printf("pf: Unable to allocate memory for " "source_hashsize %lu.\n", pf_srchashsize); pf_srchashsize = PF_SRCHASHSIZ; V_pf_srchash = mallocarray(pf_srchashsize, sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO); } pf_srchashmask = pf_srchashsize - 1; for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF); /* ALTQ */ TAILQ_INIT(&V_pf_altqs[0]); TAILQ_INIT(&V_pf_altqs[1]); TAILQ_INIT(&V_pf_altqs[2]); TAILQ_INIT(&V_pf_altqs[3]); TAILQ_INIT(&V_pf_pabuf); V_pf_altqs_active = &V_pf_altqs[0]; V_pf_altq_ifs_active = &V_pf_altqs[1]; V_pf_altqs_inactive = &V_pf_altqs[2]; V_pf_altq_ifs_inactive = &V_pf_altqs[3]; /* Send & overload+flush queues. */ STAILQ_INIT(&V_pf_sendqueue); SLIST_INIT(&V_pf_overloadqueue); TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet); /* Unlinked, but may be referenced rules. */ TAILQ_INIT(&V_pf_unlinked_rules); } void pf_mtag_cleanup() { uma_zdestroy(pf_mtag_z); } void pf_cleanup() { struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; struct pf_send_entry *pfse, *next; u_int i; for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask; i++, kh++, ih++) { KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty", __func__)); KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty", __func__)); mtx_destroy(&kh->lock); mtx_destroy(&ih->lock); } free(V_pf_keyhash, M_PFHASH); free(V_pf_idhash, M_PFHASH); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { KASSERT(LIST_EMPTY(&sh->nodes), ("%s: source node hash not empty", __func__)); mtx_destroy(&sh->lock); } free(V_pf_srchash, M_PFHASH); STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) { m_freem(pfse->pfse_m); free(pfse, M_PFTEMP); } uma_zdestroy(V_pf_sources_z); uma_zdestroy(V_pf_state_z); uma_zdestroy(V_pf_state_key_z); } static int pf_mtag_uminit(void *mem, int size, int how) { struct m_tag *t; t = (struct m_tag *)mem; t->m_tag_cookie = MTAG_ABI_COMPAT; t->m_tag_id = PACKET_TAG_PF; t->m_tag_len = sizeof(struct pf_mtag); t->m_tag_free = pf_mtag_free; return (0); } static void pf_mtag_free(struct m_tag *t) { uma_zfree(pf_mtag_z, t); } struct pf_mtag * pf_get_mtag(struct mbuf *m) { struct m_tag *mtag; if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL) return ((struct pf_mtag *)(mtag + 1)); mtag = uma_zalloc(pf_mtag_z, M_NOWAIT); if (mtag == NULL) return (NULL); bzero(mtag + 1, sizeof(struct pf_mtag)); m_tag_prepend(m, mtag); return ((struct pf_mtag *)(mtag + 1)); } static int pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks, struct pf_state *s) { struct pf_keyhash *khs, *khw, *kh; struct pf_state_key *sk, *cur; struct pf_state *si, *olds = NULL; int idx; KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__)); KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__)); /* * We need to lock hash slots of both keys. To avoid deadlock * we always lock the slot with lower address first. Unlock order * isn't important. * * We also need to lock ID hash slot before dropping key * locks. On success we return with ID hash slot locked. */ if (skw == sks) { khs = khw = &V_pf_keyhash[pf_hashkey(skw)]; PF_HASHROW_LOCK(khs); } else { khs = &V_pf_keyhash[pf_hashkey(sks)]; khw = &V_pf_keyhash[pf_hashkey(skw)]; if (khs == khw) { PF_HASHROW_LOCK(khs); } else if (khs < khw) { PF_HASHROW_LOCK(khs); PF_HASHROW_LOCK(khw); } else { PF_HASHROW_LOCK(khw); PF_HASHROW_LOCK(khs); } } #define KEYS_UNLOCK() do { \ if (khs != khw) { \ PF_HASHROW_UNLOCK(khs); \ PF_HASHROW_UNLOCK(khw); \ } else \ PF_HASHROW_UNLOCK(khs); \ } while (0) /* * First run: start with wire key. */ sk = skw; kh = khw; idx = PF_SK_WIRE; keyattach: LIST_FOREACH(cur, &kh->keys, entry) if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0) break; if (cur != NULL) { /* Key exists. Check for same kif, if none, add to key. */ TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)]; PF_HASHROW_LOCK(ih); if (si->kif == s->kif && si->direction == s->direction) { if (sk->proto == IPPROTO_TCP && si->src.state >= TCPS_FIN_WAIT_2 && si->dst.state >= TCPS_FIN_WAIT_2) { /* * New state matches an old >FIN_WAIT_2 * state. We can't drop key hash locks, * thus we can't unlink it properly. * * As a workaround we drop it into * TCPS_CLOSED state, schedule purge * ASAP and push it into the very end * of the slot TAILQ, so that it won't * conflict with our new state. */ si->src.state = si->dst.state = TCPS_CLOSED; si->timeout = PFTM_PURGE; olds = si; } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: %s key attach " "failed on %s: ", (idx == PF_SK_WIRE) ? "wire" : "stack", s->kif->pfik_name); pf_print_state_parts(s, (idx == PF_SK_WIRE) ? sk : NULL, (idx == PF_SK_STACK) ? sk : NULL); printf(", existing: "); pf_print_state_parts(si, (idx == PF_SK_WIRE) ? sk : NULL, (idx == PF_SK_STACK) ? sk : NULL); printf("\n"); } PF_HASHROW_UNLOCK(ih); KEYS_UNLOCK(); uma_zfree(V_pf_state_key_z, sk); if (idx == PF_SK_STACK) pf_detach_state(s); return (EEXIST); /* collision! */ } } PF_HASHROW_UNLOCK(ih); } uma_zfree(V_pf_state_key_z, sk); s->key[idx] = cur; } else { LIST_INSERT_HEAD(&kh->keys, sk, entry); s->key[idx] = sk; } stateattach: /* List is sorted, if-bound states before floating. */ if (s->kif == V_pfi_all) TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]); else TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]); if (olds) { TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]); TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds, key_list[idx]); olds = NULL; } /* * Attach done. See how should we (or should not?) * attach a second key. */ if (sks == skw) { s->key[PF_SK_STACK] = s->key[PF_SK_WIRE]; idx = PF_SK_STACK; sks = NULL; goto stateattach; } else if (sks != NULL) { /* * Continue attaching with stack key. */ sk = sks; kh = khs; idx = PF_SK_STACK; sks = NULL; goto keyattach; } PF_STATE_LOCK(s); KEYS_UNLOCK(); KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL, ("%s failure", __func__)); return (0); #undef KEYS_UNLOCK } static void pf_detach_state(struct pf_state *s) { struct pf_state_key *sks = s->key[PF_SK_STACK]; struct pf_keyhash *kh; if (sks != NULL) { kh = &V_pf_keyhash[pf_hashkey(sks)]; PF_HASHROW_LOCK(kh); if (s->key[PF_SK_STACK] != NULL) pf_state_key_detach(s, PF_SK_STACK); /* * If both point to same key, then we are done. */ if (sks == s->key[PF_SK_WIRE]) { pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); return; } PF_HASHROW_UNLOCK(kh); } if (s->key[PF_SK_WIRE] != NULL) { kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])]; PF_HASHROW_LOCK(kh); if (s->key[PF_SK_WIRE] != NULL) pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); } } static void pf_state_key_detach(struct pf_state *s, int idx) { struct pf_state_key *sk = s->key[idx]; #ifdef INVARIANTS struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)]; PF_HASHROW_ASSERT(kh); #endif TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]); s->key[idx] = NULL; if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) { LIST_REMOVE(sk, entry); uma_zfree(V_pf_state_key_z, sk); } } static int pf_state_key_ctor(void *mem, int size, void *arg, int flags) { struct pf_state_key *sk = mem; bzero(sk, sizeof(struct pf_state_key_cmp)); TAILQ_INIT(&sk->states[PF_SK_WIRE]); TAILQ_INIT(&sk->states[PF_SK_STACK]); return (0); } struct pf_state_key * pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t sport, u_int16_t dport) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af); PF_ACPY(&sk->addr[pd->didx], daddr, pd->af); sk->port[pd->sidx] = sport; sk->port[pd->didx] = dport; sk->proto = pd->proto; sk->af = pd->af; return (sk); } struct pf_state_key * pf_state_key_clone(struct pf_state_key *orig) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); bcopy(orig, sk, sizeof(struct pf_state_key_cmp)); return (sk); } int pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw, struct pf_state_key *sks, struct pf_state *s) { struct pf_idhash *ih; struct pf_state *cur; int error; KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]), ("%s: sks not pristine", __func__)); KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]), ("%s: skw not pristine", __func__)); KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); s->kif = kif; if (s->id == 0 && s->creatorid == 0) { /* XXX: should be atomic, but probability of collision low */ if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID) V_pf_stateid[curcpu] = 1; s->id |= (uint64_t )curcpu << PFID_CPUSHIFT; s->id = htobe64(s->id); s->creatorid = V_pf_status.hostid; } /* Returns with ID locked on success. */ if ((error = pf_state_key_attach(skw, sks, s)) != 0) return (error); ih = &V_pf_idhash[PF_IDHASH(s)]; PF_HASHROW_ASSERT(ih); LIST_FOREACH(cur, &ih->states, entry) if (cur->id == s->id && cur->creatorid == s->creatorid) break; if (cur != NULL) { PF_HASHROW_UNLOCK(ih); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state ID collision: " "id: %016llx creatorid: %08x\n", (unsigned long long)be64toh(s->id), ntohl(s->creatorid)); } pf_detach_state(s); return (EEXIST); } LIST_INSERT_HEAD(&ih->states, s, entry); /* One for keys, one for ID hash. */ refcount_init(&s->refs, 2); counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1); if (V_pfsync_insert_state_ptr != NULL) V_pfsync_insert_state_ptr(s); /* Returns locked. */ return (0); } /* * Find state by ID: returns with locked row on success. */ struct pf_state * pf_find_state_byid(uint64_t id, uint32_t creatorid) { struct pf_idhash *ih; struct pf_state *s; counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))]; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) if (s->id == id && s->creatorid == creatorid) break; if (s == NULL) PF_HASHROW_UNLOCK(ih); return (s); } /* * Find state by key. * Returns with ID hash slot locked on success. */ static struct pf_state * pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_state *s; int idx; counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK); /* List is sorted, if-bound states before floating ones. */ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) if (s->kif == V_pfi_all || s->kif == kif) { PF_STATE_LOCK(s); PF_HASHROW_UNLOCK(kh); if (s->timeout >= PFTM_MAX) { /* * State is either being processed by * pf_unlink_state() in an other thread, or * is scheduled for immediate expiry. */ PF_STATE_UNLOCK(s); return (NULL); } return (s); } PF_HASHROW_UNLOCK(kh); return (NULL); } struct pf_state * pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_state *s, *ret = NULL; int idx, inout = 0; counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1); kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } switch (dir) { case PF_IN: idx = PF_SK_WIRE; break; case PF_OUT: idx = PF_SK_STACK; break; case PF_INOUT: idx = PF_SK_WIRE; inout = 1; break; default: panic("%s: dir %u", __func__, dir); } second_run: TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) { if (more == NULL) { PF_HASHROW_UNLOCK(kh); return (s); } if (ret) (*more)++; else ret = s; } if (inout == 1) { inout = 0; idx = PF_SK_STACK; goto second_run; } PF_HASHROW_UNLOCK(kh); return (ret); } /* END state table stuff */ static void pf_send(struct pf_send_entry *pfse) { PF_SENDQ_LOCK(); STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next); PF_SENDQ_UNLOCK(); swi_sched(V_pf_swi_cookie, 0); } void pf_intr(void *v) { struct pf_send_head queue; struct pf_send_entry *pfse, *next; CURVNET_SET((struct vnet *)v); PF_SENDQ_LOCK(); queue = V_pf_sendqueue; STAILQ_INIT(&V_pf_sendqueue); PF_SENDQ_UNLOCK(); STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) { switch (pfse->pfse_type) { #ifdef INET case PFSE_IP: ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL); break; case PFSE_ICMP: icmp_error(pfse->pfse_m, pfse->icmpopts.type, pfse->icmpopts.code, 0, pfse->icmpopts.mtu); break; #endif /* INET */ #ifdef INET6 case PFSE_IP6: ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL, NULL); break; case PFSE_ICMP6: icmp6_error(pfse->pfse_m, pfse->icmpopts.type, pfse->icmpopts.code, pfse->icmpopts.mtu); break; #endif /* INET6 */ default: panic("%s: unknown type", __func__); } free(pfse, M_PFTEMP); } CURVNET_RESTORE(); } void pf_purge_thread(void *unused __unused) { VNET_ITERATOR_DECL(vnet_iter); sx_xlock(&pf_end_lock); while (pf_end_threads == 0) { sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", hz / 10); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* Wait until V_pf_default_rule is initialized. */ if (V_pf_vnet_active == 0) { CURVNET_RESTORE(); continue; } /* * Process 1/interval fraction of the state * table every run. */ V_pf_purge_idx = pf_purge_expired_states(V_pf_purge_idx, pf_hashmask / (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10)); /* * Purge other expired types every * PFTM_INTERVAL seconds. */ if (V_pf_purge_idx == 0) { /* * Order is important: * - states and src nodes reference rules * - states and rules reference kifs */ pf_purge_expired_fragments(); pf_purge_expired_src_nodes(); pf_purge_unlinked_rules(); pfi_kif_purge(); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } pf_end_threads++; sx_xunlock(&pf_end_lock); kproc_exit(0); } void pf_unload_vnet_purge(void) { /* * To cleanse up all kifs and rules we need * two runs: first one clears reference flags, * then pf_purge_expired_states() doesn't * raise them, and then second run frees. */ pf_purge_unlinked_rules(); pfi_kif_purge(); /* * Now purge everything. */ pf_purge_expired_states(0, pf_hashmask); pf_purge_fragments(UINT_MAX); pf_purge_expired_src_nodes(); /* * Now all kifs & rules should be unreferenced, * thus should be successfully freed. */ pf_purge_unlinked_rules(); pfi_kif_purge(); } u_int32_t pf_state_expires(const struct pf_state *state) { u_int32_t timeout; u_int32_t start; u_int32_t end; u_int32_t states; /* handle all PFTM_* > PFTM_MAX here */ if (state->timeout == PFTM_PURGE) return (time_uptime); KASSERT(state->timeout != PFTM_UNLINKED, ("pf_state_expires: timeout == PFTM_UNLINKED")); KASSERT((state->timeout < PFTM_MAX), ("pf_state_expires: timeout > PFTM_MAX")); timeout = state->rule.ptr->timeout[state->timeout]; if (!timeout) timeout = V_pf_default_rule.timeout[state->timeout]; start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; if (start && state->rule.ptr != &V_pf_default_rule) { end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; states = counter_u64_fetch(state->rule.ptr->states_cur); } else { start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START]; end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END]; states = V_pf_status.states; } if (end && states > start && start < end) { if (states < end) { timeout = (u_int64_t)timeout * (end - states) / (end - start); return (state->expire + timeout); } else return (time_uptime); } return (state->expire + timeout); } void pf_purge_expired_src_nodes() { struct pf_src_node_list freelist; struct pf_srchash *sh; struct pf_src_node *cur, *next; int i; LIST_INIT(&freelist); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next) if (cur->states == 0 && cur->expire <= time_uptime) { pf_unlink_src_node(cur); LIST_INSERT_HEAD(&freelist, cur, entry); } else if (cur->rule.ptr != NULL) cur->rule.ptr->rule_flag |= PFRULE_REFS; PF_HASHROW_UNLOCK(sh); } pf_free_src_nodes(&freelist); V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z); } static void pf_src_tree_remove_state(struct pf_state *s) { struct pf_src_node *sn; struct pf_srchash *sh; uint32_t timeout; timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ? s->rule.ptr->timeout[PFTM_SRC_NODE] : V_pf_default_rule.timeout[PFTM_SRC_NODE]; if (s->src_node != NULL) { sn = s->src_node; sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; PF_HASHROW_LOCK(sh); if (s->src.tcp_est) --sn->conn; if (--sn->states == 0) sn->expire = time_uptime + timeout; PF_HASHROW_UNLOCK(sh); } if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { sn = s->nat_src_node; sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; PF_HASHROW_LOCK(sh); if (--sn->states == 0) sn->expire = time_uptime + timeout; PF_HASHROW_UNLOCK(sh); } s->src_node = s->nat_src_node = NULL; } /* * Unlink and potentilly free a state. Function may be * called with ID hash row locked, but always returns * unlocked, since it needs to go through key hash locking. */ int pf_unlink_state(struct pf_state *s, u_int flags) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)]; if ((flags & PF_ENTER_LOCKED) == 0) PF_HASHROW_LOCK(ih); else PF_HASHROW_ASSERT(ih); if (s->timeout == PFTM_UNLINKED) { /* * State is being processed * by pf_unlink_state() in * an other thread. */ PF_HASHROW_UNLOCK(ih); return (0); /* XXXGL: undefined actually */ } if (s->src.state == PF_TCPS_PROXY_DST) { /* XXX wire key the right one? */ pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af, &s->key[PF_SK_WIRE]->addr[1], &s->key[PF_SK_WIRE]->addr[0], s->key[PF_SK_WIRE]->port[1], s->key[PF_SK_WIRE]->port[0], s->src.seqhi, s->src.seqlo + 1, TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL); } LIST_REMOVE(s, entry); pf_src_tree_remove_state(s); if (V_pfsync_delete_state_ptr != NULL) V_pfsync_delete_state_ptr(s); STATE_DEC_COUNTERS(s); s->timeout = PFTM_UNLINKED; PF_HASHROW_UNLOCK(ih); pf_detach_state(s); /* pf_state_insert() initialises refs to 2, so we can never release the * last reference here, only in pf_release_state(). */ (void)refcount_release(&s->refs); return (pf_release_state(s)); } void pf_free_state(struct pf_state *cur) { KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur)); KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__, cur->timeout)); pf_normalize_tcp_cleanup(cur); uma_zfree(V_pf_state_z, cur); counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1); } /* * Called only from pf_purge_thread(), thus serialized. */ static u_int pf_purge_expired_states(u_int i, int maxcheck) { struct pf_idhash *ih; struct pf_state *s; V_pf_status.states = uma_zone_get_cur(V_pf_state_z); /* * Go through hash and unlink states that expire now. */ while (maxcheck > 0) { ih = &V_pf_idhash[i]; /* only take the lock if we expect to do work */ if (!LIST_EMPTY(&ih->states)) { relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (pf_state_expires(s) <= time_uptime) { V_pf_status.states -= pf_unlink_state(s, PF_ENTER_LOCKED); goto relock; } s->rule.ptr->rule_flag |= PFRULE_REFS; if (s->nat_rule.ptr != NULL) s->nat_rule.ptr->rule_flag |= PFRULE_REFS; if (s->anchor.ptr != NULL) s->anchor.ptr->rule_flag |= PFRULE_REFS; s->kif->pfik_flags |= PFI_IFLAG_REFS; if (s->rt_kif) s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; } PF_HASHROW_UNLOCK(ih); } /* Return when we hit end of hash. */ if (++i > pf_hashmask) { V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (0); } maxcheck--; } V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (i); } static void pf_purge_unlinked_rules() { struct pf_rulequeue tmpq; struct pf_rule *r, *r1; /* * If we have overloading task pending, then we'd * better skip purging this time. There is a tiny * probability that overloading task references * an already unlinked rule. */ PF_OVERLOADQ_LOCK(); if (!SLIST_EMPTY(&V_pf_overloadqueue)) { PF_OVERLOADQ_UNLOCK(); return; } PF_OVERLOADQ_UNLOCK(); /* * Do naive mark-and-sweep garbage collecting of old rules. * Reference flag is raised by pf_purge_expired_states() * and pf_purge_expired_src_nodes(). * * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK, * use a temporary queue. */ TAILQ_INIT(&tmpq); PF_UNLNKDRULES_LOCK(); TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) { if (!(r->rule_flag & PFRULE_REFS)) { TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries); TAILQ_INSERT_TAIL(&tmpq, r, entries); } else r->rule_flag &= ~PFRULE_REFS; } PF_UNLNKDRULES_UNLOCK(); if (!TAILQ_EMPTY(&tmpq)) { PF_RULES_WLOCK(); TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) { TAILQ_REMOVE(&tmpq, r, entries); pf_free_rule(r); } PF_RULES_WUNLOCK(); } } void pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { u_int32_t a = ntohl(addr->addr32[0]); printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, (a>>8)&255, a&255); if (p) { p = ntohs(p); printf(":%u", p); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { u_int16_t b; u_int8_t i, curstart, curend, maxstart, maxend; curstart = curend = maxstart = maxend = 255; for (i = 0; i < 8; i++) { if (!addr->addr16[i]) { if (curstart == 255) curstart = i; curend = i; } else { if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } curstart = curend = 255; } } if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } for (i = 0; i < 8; i++) { if (i >= maxstart && i <= maxend) { if (i == 0) printf(":"); if (i == maxend) printf(":"); } else { b = ntohs(addr->addr16[i]); printf("%x", b); if (i < 7) printf(":"); } } if (p) { p = ntohs(p); printf("[%u]", p); } break; } #endif /* INET6 */ } } void pf_print_state(struct pf_state *s) { pf_print_state_parts(s, NULL, NULL); } static void pf_print_state_parts(struct pf_state *s, struct pf_state_key *skwp, struct pf_state_key *sksp) { struct pf_state_key *skw, *sks; u_int8_t proto, dir; /* Do our best to fill these, but they're skipped if NULL */ skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL); sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL); proto = skw ? skw->proto : (sks ? sks->proto : 0); dir = s ? s->direction : 0; switch (proto) { case IPPROTO_IPV4: printf("IPv4"); break; case IPPROTO_IPV6: printf("IPv6"); break; case IPPROTO_TCP: printf("TCP"); break; case IPPROTO_UDP: printf("UDP"); break; case IPPROTO_ICMP: printf("ICMP"); break; case IPPROTO_ICMPV6: printf("ICMPv6"); break; default: printf("%u", proto); break; } switch (dir) { case PF_IN: printf(" in"); break; case PF_OUT: printf(" out"); break; } if (skw) { printf(" wire: "); pf_print_host(&skw->addr[0], skw->port[0], skw->af); printf(" "); pf_print_host(&skw->addr[1], skw->port[1], skw->af); } if (sks) { printf(" stack: "); if (sks != skw) { pf_print_host(&sks->addr[0], sks->port[0], sks->af); printf(" "); pf_print_host(&sks->addr[1], sks->port[1], sks->af); } else printf("-"); } if (s) { if (proto == IPPROTO_TCP) { printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo, s->src.seqhi, s->src.max_win, s->src.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->src.wscale & PF_WSCALE_MASK); printf("]"); printf(" [lo=%u high=%u win=%u modulator=%u", s->dst.seqlo, s->dst.seqhi, s->dst.max_win, s->dst.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->dst.wscale & PF_WSCALE_MASK); printf("]"); } printf(" %u:%u", s->src.state, s->dst.state); } } void pf_print_flags(u_int8_t f) { if (f) printf(" "); if (f & TH_FIN) printf("F"); if (f & TH_SYN) printf("S"); if (f & TH_RST) printf("R"); if (f & TH_PUSH) printf("P"); if (f & TH_ACK) printf("A"); if (f & TH_URG) printf("U"); if (f & TH_ECE) printf("E"); if (f & TH_CWR) printf("W"); } #define PF_SET_SKIP_STEPS(i) \ do { \ while (head[i] != cur) { \ head[i]->skip[i].ptr = cur; \ head[i] = TAILQ_NEXT(head[i], entries); \ } \ } while (0) void pf_calc_skip_steps(struct pf_rulequeue *rules) { struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT]; int i; cur = TAILQ_FIRST(rules); prev = cur; for (i = 0; i < PF_SKIP_COUNT; ++i) head[i] = cur; while (cur != NULL) { if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) PF_SET_SKIP_STEPS(PF_SKIP_IFP); if (cur->direction != prev->direction) PF_SET_SKIP_STEPS(PF_SKIP_DIR); if (cur->af != prev->af) PF_SET_SKIP_STEPS(PF_SKIP_AF); if (cur->proto != prev->proto) PF_SET_SKIP_STEPS(PF_SKIP_PROTO); if (cur->src.neg != prev->src.neg || pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); if (cur->src.port[0] != prev->src.port[0] || cur->src.port[1] != prev->src.port[1] || cur->src.port_op != prev->src.port_op) PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); if (cur->dst.neg != prev->dst.neg || pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); if (cur->dst.port[0] != prev->dst.port[0] || cur->dst.port[1] != prev->dst.port[1] || cur->dst.port_op != prev->dst.port_op) PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); prev = cur; cur = TAILQ_NEXT(cur, entries); } for (i = 0; i < PF_SKIP_COUNT; ++i) PF_SET_SKIP_STEPS(i); } static int pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) { if (aw1->type != aw2->type) return (1); switch (aw1->type) { case PF_ADDR_ADDRMASK: case PF_ADDR_RANGE: if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6)) return (1); if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6)) return (1); return (0); case PF_ADDR_DYNIFTL: return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); case PF_ADDR_NOROUTE: case PF_ADDR_URPFFAILED: return (0); case PF_ADDR_TABLE: return (aw1->p.tbl != aw2->p.tbl); default: printf("invalid address type: %d\n", aw1->type); return (1); } } /** * Checksum updates are a little complicated because the checksum in the TCP/UDP * header isn't always a full checksum. In some cases (i.e. output) it's a * pseudo-header checksum, which is a partial checksum over src/dst IP * addresses, protocol number and length. * * That means we have the following cases: * * Input or forwarding: we don't have TSO, the checksum fields are full * checksums, we need to update the checksum whenever we change anything. * * Output (i.e. the checksum is a pseudo-header checksum): * x The field being updated is src/dst address or affects the length of * the packet. We need to update the pseudo-header checksum (note that this * checksum is not ones' complement). * x Some other field is being modified (e.g. src/dst port numbers): We * don't have to update anything. **/ u_int16_t pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { u_int32_t l; if (udp && !cksum) return (0x0000); l = cksum + old - new; l = (l >> 16) + (l & 65535); l = l & 65535; if (udp && !l) return (0xFFFF); return (l); } u_int16_t pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) return (cksum); return (pf_cksum_fixup(cksum, old, new, udp)); } static void pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af) { struct pf_addr ao; u_int16_t po = *p; PF_ACPY(&ao, a, af); PF_ACPY(a, an, af); if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) *pc = ~*pc; *p = pn; switch (af) { #ifdef INET case AF_INET: *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, ao.addr16[0], an->addr16[0], 0), ao.addr16[1], an->addr16[1], 0); *p = pn; *pc = pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u); *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u); break; #endif /* INET6 */ } if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) { *pc = ~*pc; if (! *pc) *pc = 0xffff; } } /* Changes a u_int32_t. Uses a void * so there are no align restrictions */ void pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), ao % 65536, an % 65536, u); } void pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_proto_cksum_fixup(m, pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp), ao % 65536, an % 65536, udp); } #ifdef INET6 static void pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) { struct pf_addr ao; PF_ACPY(&ao, a, AF_INET6); PF_ACPY(a, an, AF_INET6); *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*c, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); } #endif /* INET6 */ static void pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) { struct pf_addr oia, ooa; PF_ACPY(&oia, ia, af); if (oa) PF_ACPY(&ooa, oa, af); /* Change inner protocol port, fix inner protocol checksum. */ if (ip != NULL) { u_int16_t oip = *ip; u_int32_t opc; if (pc != NULL) opc = *pc; *ip = np; if (pc != NULL) *pc = pf_cksum_fixup(*pc, oip, *ip, u); *ic = pf_cksum_fixup(*ic, oip, *ip, 0); if (pc != NULL) *ic = pf_cksum_fixup(*ic, opc, *pc, 0); } /* Change inner ip address, fix inner ip and icmp checksums. */ PF_ACPY(ia, na, af); switch (af) { #ifdef INET case AF_INET: { u_int32_t oh2c = *h2c; *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); break; } #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], u), oia.addr16[1], ia->addr16[1], u), oia.addr16[2], ia->addr16[2], u), oia.addr16[3], ia->addr16[3], u), oia.addr16[4], ia->addr16[4], u), oia.addr16[5], ia->addr16[5], u), oia.addr16[6], ia->addr16[6], u), oia.addr16[7], ia->addr16[7], u); break; #endif /* INET6 */ } /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */ if (oa) { PF_ACPY(oa, na, af); switch (af) { #ifdef INET case AF_INET: *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, ooa.addr16[0], oa->addr16[0], 0), ooa.addr16[1], oa->addr16[1], 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, ooa.addr16[0], oa->addr16[0], u), ooa.addr16[1], oa->addr16[1], u), ooa.addr16[2], oa->addr16[2], u), ooa.addr16[3], oa->addr16[3], u), ooa.addr16[4], oa->addr16[4], u), ooa.addr16[5], oa->addr16[5], u), ooa.addr16[6], oa->addr16[6], u), ooa.addr16[7], oa->addr16[7], u); break; #endif /* INET6 */ } } } /* * Need to modulate the sequence numbers in the TCP SACK option * (credits to Krzysztof Pfaff for report and patch) */ static int pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *dst) { int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; u_int8_t opts[TCP_MAXOLEN], *opt = opts; int copyback = 0, i, olen; struct sackblk sack; #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) if (hlen < TCPOLEN_SACKLEN || !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) return 0; while (hlen >= TCPOLEN_SACKLEN) { olen = opt[1]; switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_SACK: if (olen > hlen) olen = hlen; if (olen >= TCPOLEN_SACKLEN) { for (i = 2; i + TCPOLEN_SACK <= olen; i += TCPOLEN_SACK) { memcpy(&sack, &opt[i], sizeof(sack)); pf_change_proto_a(m, &sack.start, &th->th_sum, htonl(ntohl(sack.start) - dst->seqdiff), 0); pf_change_proto_a(m, &sack.end, &th->th_sum, htonl(ntohl(sack.end) - dst->seqdiff), 0); memcpy(&opt[i], &sack, sizeof(sack)); } copyback = 1; } /* FALLTHROUGH */ default: if (olen < 2) olen = 2; hlen -= olen; opt += olen; } } if (copyback) m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); return (copyback); } static void pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af, const struct pf_addr *saddr, const struct pf_addr *daddr, u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, u_int16_t rtag, struct ifnet *ifp) { struct pf_send_entry *pfse; struct mbuf *m; int len, tlen; #ifdef INET struct ip *h = NULL; #endif /* INET */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif /* INET6 */ struct tcphdr *th; char *opt; struct pf_mtag *pf_mtag; len = 0; th = NULL; /* maximum segment size tcp option */ tlen = sizeof(struct tcphdr); if (mss) tlen += 4; switch (af) { #ifdef INET case AF_INET: len = sizeof(struct ip) + tlen; break; #endif /* INET */ #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr) + tlen; break; #endif /* INET6 */ default: panic("%s: unsupported af %d", __func__, af); } /* Allocate outgoing queue entry, mbuf and mbuf tag. */ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) return; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { free(pfse, M_PFTEMP); return; } #ifdef MAC mac_netinet_firewall_send(m); #endif if ((pf_mtag = pf_get_mtag(m)) == NULL) { free(pfse, M_PFTEMP); m_freem(m); return; } if (tag) m->m_flags |= M_SKIP_FIREWALL; pf_mtag->tag = rtag; if (r != NULL && r->rtableid >= 0) M_SETFIB(m, r->rtableid); #ifdef ALTQ if (r != NULL && r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m, struct ip *); } #endif /* ALTQ */ m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (af) { #ifdef INET case AF_INET: h = mtod(m, struct ip *); /* IP header fields included in the TCP checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(tlen); h->ip_src.s_addr = saddr->v4.s_addr; h->ip_dst.s_addr = daddr->v4.s_addr; th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); /* IP header fields included in the TCP checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(tlen); memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); break; #endif /* INET6 */ } /* TCP header */ th->th_sport = sport; th->th_dport = dport; th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_off = tlen >> 2; th->th_flags = flags; th->th_win = htons(win); if (mss) { opt = (char *)(th + 1); opt[0] = TCPOPT_MAXSEG; opt[1] = 4; HTONS(mss); bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); } switch (af) { #ifdef INET case AF_INET: /* TCP checksum */ th->th_sum = in_cksum(m, len); /* Finish the IP header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0); h->ip_len = htons(len); h->ip_ttl = ttl ? ttl : V_ip_defttl; h->ip_sum = 0; pfse->pfse_type = PFSE_IP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: /* TCP checksum */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen); h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; pfse->pfse_type = PFSE_IP6; break; #endif /* INET6 */ } pfse->pfse_m = m; pf_send(pfse); } static void pf_return(struct pf_rule *r, struct pf_rule *nr, struct pf_pdesc *pd, struct pf_state_key *sk, int off, struct mbuf *m, struct tcphdr *th, struct pfi_kif *kif, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen, u_short *reason) { struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; sa_family_t af = pd->af; /* undo NAT changes, if they have taken place */ if (nr != NULL) { PF_ACPY(saddr, &sk->addr[pd->sidx], af); PF_ACPY(daddr, &sk->addr[pd->didx], af); if (pd->sport) *pd->sport = sk->port[pd->sidx]; if (pd->dport) *pd->dport = sk->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } if (pd->proto == IPPROTO_TCP && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURN)) && !(th->th_flags & TH_RST)) { u_int32_t ack = ntohl(th->th_seq) + pd->p_len; int len = 0; #ifdef INET struct ip *h4; #endif #ifdef INET6 struct ip6_hdr *h6; #endif switch (af) { #ifdef INET case AF_INET: h4 = mtod(m, struct ip *); len = ntohs(h4->ip_len) - off; break; #endif #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); len = ntohs(h6->ip6_plen) - (off - sizeof(*h6)); break; #endif } if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) REASON_SET(reason, PFRES_PROTCKSUM); else { if (th->th_flags & TH_SYN) ack++; if (th->th_flags & TH_FIN) ack++; pf_send_tcp(m, r, af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, r->return_ttl, 1, 0, kif->pfik_ifp); } } else if (pd->proto != IPPROTO_ICMP && af == AF_INET && r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r); else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r); } static int pf_ieee8021q_setpcp(struct mbuf *m, u_int8_t prio) { struct m_tag *mtag; KASSERT(prio <= PF_PRIO_MAX, ("%s with invalid pcp", __func__)); mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL); if (mtag == NULL) { mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_OUT, sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) return (ENOMEM); m_tag_prepend(m, mtag); } *(uint8_t *)(mtag + 1) = prio; return (0); } static int pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m) { struct m_tag *mtag; u_int8_t mpcp; mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) return (0); if (prio == PF_PRIO_ZERO) prio = 0; mpcp = *(uint8_t *)(mtag + 1); return (mpcp == prio); } static void pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, struct pf_rule *r) { struct pf_send_entry *pfse; struct mbuf *m0; struct pf_mtag *pf_mtag; /* Allocate outgoing queue entry, mbuf and mbuf tag. */ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) return; if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) { free(pfse, M_PFTEMP); return; } if ((pf_mtag = pf_get_mtag(m0)) == NULL) { free(pfse, M_PFTEMP); return; } /* XXX: revisit */ m0->m_flags |= M_SKIP_FIREWALL; if (r->rtableid >= 0) M_SETFIB(m0, r->rtableid); #ifdef ALTQ if (r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m0, struct ip *); } #endif /* ALTQ */ switch (af) { #ifdef INET case AF_INET: pfse->pfse_type = PFSE_ICMP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: pfse->pfse_type = PFSE_ICMP6; break; #endif /* INET6 */ } pfse->pfse_m = m0; pfse->icmpopts.type = type; pfse->icmpopts.code = code; pf_send(pfse); } /* * Return 1 if the addresses a and b match (with mask m), otherwise return 0. * If n is 0, they match if they are equal. If n is != 0, they match if they * are different. */ int pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, struct pf_addr *b, sa_family_t af) { int match = 0; switch (af) { #ifdef INET case AF_INET: if ((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) match++; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) && ((a->addr32[1] & m->addr32[1]) == (b->addr32[1] & m->addr32[1])) && ((a->addr32[2] & m->addr32[2]) == (b->addr32[2] & m->addr32[2])) && ((a->addr32[3] & m->addr32[3]) == (b->addr32[3] & m->addr32[3]))) match++; break; #endif /* INET6 */ } if (match) { if (n) return (0); else return (1); } else { if (n) return (1); else return (0); } } /* * Return 1 if b <= a <= e, otherwise return 0. */ int pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, struct pf_addr *a, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) || (ntohl(a->addr32[0]) > ntohl(e->addr32[0]))) return (0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: { int i; /* check a >= b */ for (i = 0; i < 4; ++i) if (ntohl(a->addr32[i]) > ntohl(b->addr32[i])) break; else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i])) return (0); /* check a <= e */ for (i = 0; i < 4; ++i) if (ntohl(a->addr32[i]) < ntohl(e->addr32[i])) break; else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i])) return (0); break; } #endif /* INET6 */ } return (1); } static int pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) { switch (op) { case PF_OP_IRG: return ((p > a1) && (p < a2)); case PF_OP_XRG: return ((p < a1) || (p > a2)); case PF_OP_RRG: return ((p >= a1) && (p <= a2)); case PF_OP_EQ: return (p == a1); case PF_OP_NE: return (p != a1); case PF_OP_LT: return (p < a1); case PF_OP_LE: return (p <= a1); case PF_OP_GT: return (p > a1); case PF_OP_GE: return (p >= a1); } return (0); /* never reached */ } int pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) { NTOHS(a1); NTOHS(a2); NTOHS(p); return (pf_match(op, a1, a2, p)); } static int pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) { if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, u)); } static int pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) { if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, g)); } int pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag) { if (*tag == -1) *tag = mtag; return ((!r->match_tag_not && r->match_tag == *tag) || (r->match_tag_not && r->match_tag != *tag)); } int pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag) { KASSERT(tag > 0, ("%s: tag %d", __func__, tag)); if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL)) return (ENOMEM); pd->pf_mtag->tag = tag; return (0); } #define PF_ANCHOR_STACKSIZE 32 struct pf_anchor_stackframe { struct pf_ruleset *rs; struct pf_rule *r; /* XXX: + match bit */ struct pf_anchor *child; }; /* * XXX: We rely on malloc(9) returning pointer aligned addresses. */ #define PF_ANCHORSTACK_MATCH 0x00000001 #define PF_ANCHORSTACK_MASK (PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_RULE(f) (struct pf_rule *) \ ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK) #define PF_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \ ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \ } while (0) void pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth, struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a, int *match) { struct pf_anchor_stackframe *f; PF_RULES_RASSERT(); if (match) *match = 0; if (*depth >= PF_ANCHOR_STACKSIZE) { printf("%s: anchor stack overflow on %s\n", __func__, (*r)->anchor->name); *r = TAILQ_NEXT(*r, entries); return; } else if (*depth == 0 && a != NULL) *a = *r; f = stack + (*depth)++; f->rs = *rs; f->r = *r; if ((*r)->anchor_wildcard) { struct pf_anchor_node *parent = &(*r)->anchor->children; if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) { *r = NULL; return; } *rs = &f->child->ruleset; } else { f->child = NULL; *rs = &(*r)->anchor->ruleset; } *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); } int pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth, struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a, int *match) { struct pf_anchor_stackframe *f; struct pf_rule *fr; int quick = 0; PF_RULES_RASSERT(); do { if (*depth <= 0) break; f = stack + *depth - 1; fr = PF_ANCHOR_RULE(f); if (f->child != NULL) { struct pf_anchor_node *parent; /* * This block traverses through * a wildcard anchor. */ parent = &fr->anchor->children; if (match != NULL && *match) { /* * If any of "*" matched, then * "foo/ *" matched, mark frame * appropriately. */ PF_ANCHOR_SET_MATCH(f); *match = 0; } f->child = RB_NEXT(pf_anchor_node, parent, f->child); if (f->child != NULL) { *rs = &f->child->ruleset; *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); if (*r == NULL) continue; else break; } } (*depth)--; if (*depth == 0 && a != NULL) *a = NULL; *rs = f->rs; if (PF_ANCHOR_MATCH(f) || (match != NULL && *match)) quick = fr->quick; *r = TAILQ_NEXT(fr, entries); } while (*r == NULL); return (quick); } #ifdef INET6 void pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); break; #endif /* INET */ case AF_INET6: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); break; } } void pf_addr_inc(struct pf_addr *addr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); break; #endif /* INET */ case AF_INET6: if (addr->addr32[3] == 0xffffffff) { addr->addr32[3] = 0; if (addr->addr32[2] == 0xffffffff) { addr->addr32[2] = 0; if (addr->addr32[1] == 0xffffffff) { addr->addr32[1] = 0; addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); } else addr->addr32[1] = htonl(ntohl(addr->addr32[1]) + 1); } else addr->addr32[2] = htonl(ntohl(addr->addr32[2]) + 1); } else addr->addr32[3] = htonl(ntohl(addr->addr32[3]) + 1); break; } } #endif /* INET6 */ int pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m) { struct pf_addr *saddr, *daddr; u_int16_t sport, dport; struct inpcbinfo *pi; struct inpcb *inp; pd->lookup.uid = UID_MAX; pd->lookup.gid = GID_MAX; switch (pd->proto) { case IPPROTO_TCP: if (pd->hdr.tcp == NULL) return (-1); sport = pd->hdr.tcp->th_sport; dport = pd->hdr.tcp->th_dport; pi = &V_tcbinfo; break; case IPPROTO_UDP: if (pd->hdr.udp == NULL) return (-1); sport = pd->hdr.udp->uh_sport; dport = pd->hdr.udp->uh_dport; pi = &V_udbinfo; break; default: return (-1); } if (direction == PF_IN) { saddr = pd->src; daddr = pd->dst; } else { u_int16_t p; p = sport; sport = dport; dport = p; saddr = pd->dst; daddr = pd->src; } switch (pd->af) { #ifdef INET case AF_INET: inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET6 */ default: return (-1); } INP_RLOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; INP_RUNLOCK(inp); return (1); } static u_int8_t pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int8_t wscale = 0; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= 3) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_WINDOW: wscale = opt[2]; if (wscale > TCP_MAX_WINSHIFT) wscale = TCP_MAX_WINSHIFT; wscale |= PF_WSCALE_FLAG; /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (wscale); } static u_int16_t pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int16_t mss = V_tcp_mssdflt; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= TCPOLEN_MAXSEG) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_MAXSEG: bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); NTOHS(mss); /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (mss); } static u_int16_t pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer) { #ifdef INET struct nhop4_basic nh4; #endif /* INET */ #ifdef INET6 struct nhop6_basic nh6; struct in6_addr dst6; uint32_t scopeid; #endif /* INET6 */ int hlen = 0; uint16_t mss = 0; switch (af) { #ifdef INET case AF_INET: hlen = sizeof(struct ip); if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) == 0) mss = nh4.nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET */ #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); in6_splitscope(&addr->v6, &dst6, &scopeid); if (fib6_lookup_nh_basic(rtableid, &dst6, scopeid, 0,0,&nh6)==0) mss = nh6.nh_mtu - hlen - sizeof(struct tcphdr); break; #endif /* INET6 */ } mss = max(V_tcp_mssdflt, mss); mss = min(mss, offer); mss = max(mss, 64); /* sanity - at least max opt space */ return (mss); } static u_int32_t pf_tcp_iss(struct pf_pdesc *pd) { MD5_CTX ctx; u_int32_t digest[4]; if (V_pf_tcp_secret_init == 0) { arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); MD5Init(&V_pf_tcp_secret_ctx); MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); V_pf_tcp_secret_init = 1; } ctx = V_pf_tcp_secret_ctx; MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short)); MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short)); if (pd->af == AF_INET6) { MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr)); MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr)); } else { MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr)); MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr)); } MD5Final((u_char *)digest, &ctx); V_pf_tcp_iss_off += 4096; #define ISN_RANDOM_INCREMENT (4096 - 1) return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) + V_pf_tcp_iss_off); #undef ISN_RANDOM_INCREMENT } static int pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp) { struct pf_rule *nr = NULL; struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; sa_family_t af = pd->af; struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; struct pf_src_node *nsn = NULL; struct tcphdr *th = pd->hdr.tcp; struct pf_state_key *sk = NULL, *nk = NULL; u_short reason; int rewrite = 0, hdrlen = 0; int tag = -1, rtableid = -1; int asd = 0; int match = 0; int state_icmp = 0; u_int16_t sport = 0, dport = 0; u_int16_t bproto_sum = 0, bip_sum = 0; u_int8_t icmptype = 0, icmpcode = 0; struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; pd->lookup.done = 1; } switch (pd->proto) { case IPPROTO_TCP: sport = th->th_sport; dport = th->th_dport; hdrlen = sizeof(*th); break; case IPPROTO_UDP: sport = pd->hdr.udp->uh_sport; dport = pd->hdr.udp->uh_dport; hdrlen = sizeof(*pd->hdr.udp); break; #ifdef INET case IPPROTO_ICMP: if (pd->af != AF_INET) break; sport = dport = pd->hdr.icmp->icmp_id; hdrlen = sizeof(*pd->hdr.icmp); icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: if (af != AF_INET6) break; sport = dport = pd->hdr.icmp6->icmp6_id; hdrlen = sizeof(*pd->hdr.icmp6); icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ default: sport = dport = hdrlen = 0; break; } r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); /* check packet for BINAT/NAT/RDR */ if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk, &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) { KASSERT(sk != NULL, ("%s: null sk", __func__)); KASSERT(nk != NULL, ("%s: null nk", __func__)); if (pd->ip_sum) bip_sum = *pd->ip_sum; switch (pd->proto) { case IPPROTO_TCP: bproto_sum = th->th_sum; pd->proto_sum = &th->th_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, af); pd->sport = &th->th_sport; sport = th->th_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, af); dport = th->th_dport; pd->dport = &th->th_dport; } rewrite++; break; case IPPROTO_UDP: bproto_sum = pd->hdr.udp->uh_sum; pd->proto_sum = &pd->hdr.udp->uh_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(m, saddr, &pd->hdr.udp->uh_sport, pd->ip_sum, &pd->hdr.udp->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, af); sport = pd->hdr.udp->uh_sport; pd->sport = &pd->hdr.udp->uh_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(m, daddr, &pd->hdr.udp->uh_dport, pd->ip_sum, &pd->hdr.udp->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, af); dport = pd->hdr.udp->uh_dport; pd->dport = &pd->hdr.udp->uh_dport; } rewrite++; break; #ifdef INET case IPPROTO_ICMP: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[1] != pd->hdr.icmp->icmp_id) { pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, sport, nk->port[1], 0); pd->hdr.icmp->icmp_id = nk->port[1]; pd->sport = &pd->hdr.icmp->icmp_id; } m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->didx], 0); rewrite++; break; #endif /* INET */ default: switch (af) { #ifdef INET case AF_INET: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) PF_ACPY(saddr, &nk->addr[pd->sidx], af); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) PF_ACPY(daddr, &nk->addr[pd->didx], af); break; #endif /* INET */ } break; } if (nr->natpass) r = NULL; pd->nat_rule = nr; } while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; /* icmp only. type always 0 in other cases */ else if (r->type && r->type != icmptype + 1) r = TAILQ_NEXT(r, entries); /* icmp only. type always 0 in other cases */ else if (r->code && r->code != icmpcode + 1) r = TAILQ_NEXT(r, entries); else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->flagset & th->th_flags) != r->flags) r = TAILQ_NEXT(r, entries); /* tcp/udp only. uid.op always 0 in other cases */ else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(direction, pd, m), 1)) && !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], pd->lookup.uid)) r = TAILQ_NEXT(r, entries); /* tcp/udp only. gid.op always 0 in other cases */ else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(direction, pd, m), 1)) && !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], pd->lookup.gid)) r = TAILQ_NEXT(r, entries); else if (r->prio && !pf_match_ieee8021q_pcp(r->prio, m)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != IPPROTO_TCP || !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint))) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->log)) { if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a, ruleset, pd, 1); } if ((r->action == PF_DROP) && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURNICMP) || (r->rule_flag & PFRULE_RETURN))) { pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum, bip_sum, hdrlen, &reason); } if (r->action == PF_DROP) goto cleanup; if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } if (rtableid >= 0) M_SETFIB(m, rtableid); if (!state_icmp && (r->keep_state || nr != NULL || (pd->flags & PFDESC_TCP_NORM))) { int action; action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, hdrlen); if (action != PF_PASS) { if (action == PF_DROP && (r->rule_flag & PFRULE_RETURN)) pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum, bip_sum, hdrlen, &reason); return (action); } } else { if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); } /* copy back packet headers if we performed NAT operations */ if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) && direction == PF_OUT && V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, m)) /* * We want the state created, but we dont * want to send this in case a partner * firewall has to know about it to allow * replies through it. */ return (PF_DEFER); return (PF_PASS); cleanup: if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); return (PF_DROP); } static int pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a, struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk, struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm, int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen) { struct pf_state *s = NULL; struct pf_src_node *sn = NULL; struct tcphdr *th = pd->hdr.tcp; u_int16_t mss = V_tcp_mssdflt; u_short reason; /* check maximums */ if (r->max_states && (counter_u64_fetch(r->states_cur) >= r->max_states)) { counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1); REASON_SET(&reason, PFRES_MAXSTATES); goto csfailed; } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) { REASON_SET(&reason, PFRES_SRCLIMIT); goto csfailed; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) { REASON_SET(&reason, PFRES_SRCLIMIT); goto csfailed; } s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto csfailed; } s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; STATE_INC_COUNTERS(s); if (r->allow_opts) s->state_flags |= PFSTATE_ALLOWOPTS; if (r->rule_flag & PFRULE_STATESLOPPY) s->state_flags |= PFSTATE_SLOPPY; s->log = r->log & PF_LOG_ALL; s->sync_state = PFSYNC_S_NONE; if (nr != NULL) s->log |= nr->log & PF_LOG_ALL; switch (pd->proto) { case IPPROTO_TCP: s->src.seqlo = ntohl(th->th_seq); s->src.seqhi = s->src.seqlo + pd->p_len + 1; if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_MODULATE) { /* Generate sequence number modulator */ if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) == 0) s->src.seqdiff = 1; pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(s->src.seqlo + s->src.seqdiff), 0); *rewrite = 1; } else s->src.seqdiff = 0; if (th->th_flags & TH_SYN) { s->src.seqhi++; s->src.wscale = pf_get_wscale(m, off, th->th_off, pd->af); } s->src.max_win = MAX(ntohs(th->th_win), 1); if (s->src.wscale & PF_WSCALE_MASK) { /* Remove scale factor from initial window */ int win = s->src.max_win; win += 1 << (s->src.wscale & PF_WSCALE_MASK); s->src.max_win = (win - 1) >> (s->src.wscale & PF_WSCALE_MASK); } if (th->th_flags & TH_FIN) s->src.seqhi++; s->dst.seqhi = 1; s->dst.max_win = 1; s->src.state = TCPS_SYN_SENT; s->dst.state = TCPS_CLOSED; s->timeout = PFTM_TCP_FIRST_PACKET; break; case IPPROTO_UDP: s->src.state = PFUDPS_SINGLE; s->dst.state = PFUDPS_NO_TRAFFIC; s->timeout = PFTM_UDP_FIRST_PACKET; break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif s->timeout = PFTM_ICMP_FIRST_PACKET; break; default: s->src.state = PFOTHERS_SINGLE; s->dst.state = PFOTHERS_NO_TRAFFIC; s->timeout = PFTM_OTHER_FIRST_PACKET; } if (r->rt) { if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) { REASON_SET(&reason, PFRES_MAPFAILED); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); goto csfailed; } s->rt_kif = r->rpool.cur->kif; } s->creation = time_uptime; s->expire = time_uptime; if (sn != NULL) s->src_node = sn; if (nsn != NULL) { /* XXX We only modify one side for now. */ PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af); s->nat_src_node = nsn; } if (pd->proto == IPPROTO_TCP) { if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m, off, pd, th, &s->src, &s->dst)) { REASON_SET(&reason, PFRES_MEMORY); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub && pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, &s->src, &s->dst, rewrite)) { /* This really shouldn't happen!!! */ DPFPRINTF(PF_DEBUG_URGENT, ("pf_normalize_tcp_stateful failed on first pkt")); pf_normalize_tcp_cleanup(s); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } } s->direction = pd->dir; /* * sk/nk could already been setup by pf_get_translation(). */ if (nr == NULL) { KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport); if (sk == NULL) goto csfailed; nk = sk; } else KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); /* Swap sk/nk for PF_OUT. */ if (pf_state_insert(BOUND_IFACE(r, kif), (pd->dir == PF_IN) ? sk : nk, (pd->dir == PF_IN) ? nk : sk, s)) { if (pd->proto == IPPROTO_TCP) pf_normalize_tcp_cleanup(s); REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } else *sm = s; if (tag > 0) s->tag = tag; if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { s->src.state = PF_TCPS_PROXY_SRC; /* undo NAT changes, if they have taken place */ if (nr != NULL) { struct pf_state_key *skt = s->key[PF_SK_WIRE]; if (pd->dir == PF_OUT) skt = s->key[PF_SK_STACK]; PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af); PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af); if (pd->sport) *pd->sport = skt->port[pd->sidx]; if (pd->dport) *pd->dport = skt->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } s->src.seqhi = htonl(arc4random()); /* Find mss option */ int rtid = M_GETFIB(m); mss = pf_get_mss(m, off, th->th_off, pd->af); mss = pf_calc_mss(pd->src, pd->af, rtid, mss); mss = pf_calc_mss(pd->dst, pd->af, rtid, mss); s->src.mss = mss; pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL); REASON_SET(&reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } return (PF_PASS); csfailed: if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); if (sn != NULL) { struct pf_srchash *sh; sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)]; PF_HASHROW_LOCK(sh); if (--sn->states == 0 && sn->expire == 0) { pf_unlink_src_node(sn); uma_zfree(V_pf_sources_z, sn); counter_u64_add( V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); } PF_HASHROW_UNLOCK(sh); } if (nsn != sn && nsn != NULL) { struct pf_srchash *sh; sh = &V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)]; PF_HASHROW_LOCK(sh); if (--nsn->states == 0 && nsn->expire == 0) { pf_unlink_src_node(nsn); uma_zfree(V_pf_sources_z, nsn); counter_u64_add( V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1); } PF_HASHROW_UNLOCK(sh); } return (PF_DROP); } static int pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm) { struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; sa_family_t af = pd->af; u_short reason; int tag = -1; int asd = 0; int match = 0; struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_UDP && (r->src.port_op || r->dst.port_op)) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->src.port_op || r->dst.port_op || r->flagset)) r = TAILQ_NEXT(r, entries); else if ((pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6) && (r->type || r->code)) r = TAILQ_NEXT(r, entries); else if (r->prio && !pf_match_ieee8021q_pcp(r->prio, m)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= (arc4random() % (UINT_MAX - 1) + 1)) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else { if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log) PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd, 1); if (r->action != PF_PASS) return (PF_DROP); if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } return (PF_PASS); } static int pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst, struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, int *copyback) { struct tcphdr *th = pd->hdr.tcp; u_int16_t win = ntohs(th->th_win); u_int32_t ack, end, seq, orig_seq; u_int8_t sws, dws; int ackskew; if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { sws = src->wscale & PF_WSCALE_MASK; dws = dst->wscale & PF_WSCALE_MASK; } else sws = dws = 0; /* * Sequence tracking algorithm from Guido van Rooij's paper: * http://www.madison-gurkha.com/publications/tcp_filtering/ * tcp_filtering.ps */ orig_seq = seq = ntohl(th->th_seq); if (src->seqlo == 0) { /* First packet from this end. Set its state */ if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) && src->scrub == NULL) { if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { REASON_SET(reason, PFRES_MEMORY); return (PF_DROP); } } /* Deferred generation of sequence number modulator */ if (dst->seqdiff && !src->seqdiff) { /* use random iss for the TCP server */ while ((src->seqdiff = arc4random() - seq) == 0) ; ack = ntohl(th->th_ack) - dst->seqdiff; pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } else { ack = ntohl(th->th_ack); } end = seq + pd->p_len; if (th->th_flags & TH_SYN) { end++; if (dst->wscale & PF_WSCALE_FLAG) { src->wscale = pf_get_wscale(m, off, th->th_off, pd->af); if (src->wscale & PF_WSCALE_FLAG) { /* Remove scale factor from initial * window */ sws = src->wscale & PF_WSCALE_MASK; win = ((u_int32_t)win + (1 << sws) - 1) >> sws; dws = dst->wscale & PF_WSCALE_MASK; } else { /* fixup other window */ dst->max_win <<= dst->wscale & PF_WSCALE_MASK; /* in case of a retrans SYN|ACK */ dst->wscale = 0; } } } if (th->th_flags & TH_FIN) end++; src->seqlo = seq; if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; /* * May need to slide the window (seqhi may have been set by * the crappy stack check or if we picked up the connection * after establishment) */ if (src->seqhi == 1 || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) src->seqhi = end + MAX(1, dst->max_win << dws); if (win > src->max_win) src->max_win = win; } else { ack = ntohl(th->th_ack) - dst->seqdiff; if (src->seqdiff) { /* Modulate sequence numbers */ pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } end = seq + pd->p_len; if (th->th_flags & TH_SYN) end++; if (th->th_flags & TH_FIN) end++; } if ((th->th_flags & TH_ACK) == 0) { /* Let it pass through the ack skew check */ ack = dst->seqlo; } else if ((ack == 0 && (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || /* broken tcp stacks do not set ack */ (dst->state < TCPS_SYN_SENT)) { /* * Many stacks (ours included) will set the ACK number in an * FIN|ACK if the SYN times out -- no sequence to ACK. */ ack = dst->seqlo; } if (seq == end) { /* Ease sequencing restrictions on no data packets */ seq = src->seqlo; end = seq; } ackskew = dst->seqlo - ack; /* * Need to demodulate the sequence numbers in any TCP SACK options * (Selective ACK). We could optionally validate the SACK values * against the current ACK window, either forwards or backwards, but * I'm not confident that SACK has been implemented properly * everywhere. It wouldn't surprise me if several stacks accidentally * SACK too far backwards of previously ACKed data. There really aren't * any security implications of bad SACKing unless the target stack * doesn't validate the option length correctly. Someone trying to * spoof into a TCP connection won't bother blindly sending SACK * options anyway. */ if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { if (pf_modulate_sack(m, off, pd, th, dst)) *copyback = 1; } #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ if (SEQ_GEQ(src->seqhi, end) && /* Last octet inside other's window space */ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && /* Retrans: not more than one window back */ (ackskew >= -MAXACKWINDOW) && /* Acking not more than one reassembled fragment backwards */ (ackskew <= (MAXACKWINDOW << sws)) && /* Acking not more than one window forward */ ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) || (pd->flags & PFDESC_IP_REAS) == 0)) { /* Require an exact/+1 sequence match on resets when possible */ if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* update states */ if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { dst->state = TCPS_ESTABLISHED; if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) dst->state = TCPS_FIN_WAIT_2; } if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; /* Fall through to PASS packet */ } else if ((dst->state < TCPS_SYN_SENT || dst->state >= TCPS_FIN_WAIT_2 || src->state >= TCPS_FIN_WAIT_2) && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && /* Within a window forward of the originating packet */ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { /* Within a window backward of the originating packet */ /* * This currently handles three situations: * 1) Stupid stacks will shotgun SYNs before their peer * replies. * 2) When PF catches an already established stream (the * firewall rebooted, the state table was flushed, routes * changed...) * 3) Packets get funky immediately after the connection * closes (this should catch Solaris spurious ACK|FINs * that web servers like to spew after a close) * * This must be a little more careful than the above code * since packet floods will also be caught here. We don't * update the TTL here to mitigate the damage of a packet * flood and so the same code can handle awkward establishment * and a loosened connection close. * In the establishment case, a correct peer response will * validate the connection, go through the normal state code * and keep updating the state TTL. */ if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: loose state match: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1], pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); } if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* * Cannot set dst->seqhi here since this could be a shotgunned * SYN and not an already established connection. */ if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* Fall through to PASS packet */ } else { if ((*state)->dst.state == TCPS_SYN_SENT && (*state)->src.state == TCPS_SYN_SENT) { /* Send RST for state mismatches during handshake */ if (!(th->th_flags & TH_RST)) pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), 0, TH_RST, 0, 0, (*state)->rule.ptr->return_ttl, 1, 0, kif->pfik_ifp); src->seqlo = 0; src->seqhi = 1; src->max_win = 1; } else if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD state: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1], pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", SEQ_GEQ(src->seqhi, end) ? ' ' : '1', SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6'); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } return (PF_PASS); } static int pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst, struct pf_state **state, struct pf_pdesc *pd, u_short *reason) { struct tcphdr *th = pd->hdr.tcp; if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { dst->state = TCPS_ESTABLISHED; if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) { dst->state = TCPS_FIN_WAIT_2; } else if (src->state == TCPS_SYN_SENT && dst->state < TCPS_SYN_SENT) { /* * Handle a special sloppy case where we only see one * half of the connection. If there is a ACK after * the initial SYN without ever seeing a packet from * the destination, set the connection to established. */ dst->state = src->state = TCPS_ESTABLISHED; if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (src->state == TCPS_CLOSING && dst->state == TCPS_ESTABLISHED && dst->seqlo == 0) { /* * Handle the closing of half connections where we * don't see the full bidirectional FIN/ACK+ACK * handshake. */ dst->state = TCPS_CLOSING; } } if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; return (PF_PASS); } static int pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_state_key_cmp key; struct tcphdr *th = pd->hdr.tcp; int copyback = 0; struct pf_state_peer *src, *dst; struct pf_state_key *sk; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_TCP; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = th->th_sport; key.port[1] = th->th_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = th->th_sport; key.port[0] = th->th_dport; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } sk = (*state)->key[pd->didx]; if ((*state)->src.state == PF_TCPS_PROXY_SRC) { if (direction != (*state)->direction) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } if (th->th_flags & TH_SYN) { if (ntohl(th->th_seq) != (*state)->src.seqlo) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, (*state)->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if ((th->th_flags & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } else (*state)->src.state = PF_TCPS_PROXY_DST; } if ((*state)->src.state == PF_TCPS_PROXY_DST) { if (direction == (*state)->direction) { if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } (*state)->src.max_win = MAX(ntohs(th->th_win), 1); if ((*state)->dst.seqhi == 1) (*state)->dst.seqhi = htonl(arc4random()); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->dst.seqhi, 0, TH_SYN, 0, (*state)->src.mss, 0, 0, (*state)->tag, NULL); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if (((th->th_flags & (TH_SYN|TH_ACK)) != (TH_SYN|TH_ACK)) || (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else { (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); (*state)->dst.seqlo = ntohl(th->th_seq); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ntohl(th->th_seq) + 1, TH_ACK, (*state)->src.max_win, 0, 0, 0, (*state)->tag, NULL); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL); (*state)->src.seqdiff = (*state)->dst.seqhi - (*state)->src.seqlo; (*state)->dst.seqdiff = (*state)->src.seqhi - (*state)->dst.seqlo; (*state)->src.seqhi = (*state)->src.seqlo + (*state)->dst.max_win; (*state)->dst.seqhi = (*state)->dst.seqlo + (*state)->src.max_win; (*state)->src.wscale = (*state)->dst.wscale = 0; (*state)->src.state = (*state)->dst.state = TCPS_ESTABLISHED; REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } } if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) && dst->state >= TCPS_FIN_WAIT_2 && src->state >= TCPS_FIN_WAIT_2) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state reuse "); pf_print_state(*state); pf_print_flags(th->th_flags); printf("\n"); } /* XXX make sure it's the same direction ?? */ (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; pf_unlink_state(*state, PF_ENTER_LOCKED); *state = NULL; return (PF_DROP); } if ((*state)->state_flags & PFSTATE_SLOPPY) { if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP) return (PF_DROP); } else { if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason, ©back) == PF_DROP) return (PF_DROP); } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != th->th_sport) pf_change_ap(m, pd->src, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != th->th_dport) pf_change_ap(m, pd->dst, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, pd->af); copyback = 1; } /* Copyback sequence modulation or stateful scrub changes if needed */ if (copyback) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); } static int pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; struct udphdr *uh = pd->hdr.udp; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_UDP; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = uh->uh_sport; key.port[1] = uh->uh_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = uh->uh_sport; key.port[0] = uh->uh_dport; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFUDPS_SINGLE) src->state = PFUDPS_SINGLE; if (dst->state == PFUDPS_SINGLE) dst->state = PFUDPS_MULTIPLE; /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE) (*state)->timeout = PFTM_UDP_MULTIPLE; else (*state)->timeout = PFTM_UDP_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != uh->uh_sport) pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != uh->uh_dport) pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, pd->af); m_copyback(m, off, sizeof(*uh), (caddr_t)uh); } return (PF_PASS); } static int pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_addr *saddr = pd->src, *daddr = pd->dst; u_int16_t icmpid = 0, *icmpsum; u_int8_t icmptype, icmpcode; int state_icmp = 0; struct pf_state_key_cmp key; bzero(&key, sizeof(key)); switch (pd->proto) { #ifdef INET case IPPROTO_ICMP: icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; icmpid = pd->hdr.icmp->icmp_id; icmpsum = &pd->hdr.icmp->icmp_cksum; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; icmpid = pd->hdr.icmp6->icmp6_id; icmpsum = &pd->hdr.icmp6->icmp6_cksum; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ } if (!state_icmp) { /* * ICMP query/reply message not related to a TCP/UDP packet. * Search for an ICMP state. */ key.af = pd->af; key.proto = pd->proto; key.port[0] = key.port[1] = icmpid; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); } STATE_LOOKUP(kif, &key, direction, *state, pd); (*state)->expire = time_uptime; (*state)->timeout = PFTM_ICMP_ERROR_REPLY; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[0] != pd->hdr.icmp->icmp_id) { pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, nk->port[pd->sidx], 0); pd->hdr.icmp->icmp_id = nk->port[pd->sidx]; } m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->didx], 0); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); break; #endif /* INET6 */ } } return (PF_PASS); } else { /* * ICMP error message in response to a TCP/UDP packet. * Extract the inner TCP/UDP header and search for that state. */ struct pf_pdesc pd2; bzero(&pd2, sizeof pd2); #ifdef INET struct ip h2; #endif /* INET */ #ifdef INET6 struct ip6_hdr h2_6; int terminal = 0; #endif /* INET6 */ int ipoff2 = 0; int off2 = 0; pd2.af = pd->af; /* Payload packet is from the opposite direction. */ pd2.sidx = (direction == PF_IN) ? 1 : 0; pd2.didx = (direction == PF_IN) ? 0 : 1; switch (pd->af) { #ifdef INET case AF_INET: /* offset of h2 in mbuf chain */ ipoff2 = off + ICMP_MINLEN; if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip)\n")); return (PF_DROP); } /* * ICMP error messages don't refer to non-first * fragments */ if (h2.ip_off & htons(IP_OFFMASK)) { REASON_SET(reason, PFRES_FRAG); return (PF_DROP); } /* offset of protocol header that follows h2 */ off2 = ipoff2 + (h2.ip_hl << 2); pd2.proto = h2.ip_p; pd2.src = (struct pf_addr *)&h2.ip_src; pd2.dst = (struct pf_addr *)&h2.ip_dst; pd2.ip_sum = &h2.ip_sum; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipoff2 = off + sizeof(struct icmp6_hdr); if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip6)\n")); return (PF_DROP); } pd2.proto = h2_6.ip6_nxt; pd2.src = (struct pf_addr *)&h2_6.ip6_src; pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; pd2.ip_sum = NULL; off2 = ipoff2 + sizeof(h2_6); do { switch (pd2.proto) { case IPPROTO_FRAGMENT: /* * ICMPv6 error messages for * non-first fragments */ REASON_SET(reason, PFRES_FRAG); return (PF_DROP); case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off2, &opt6, sizeof(opt6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMPv6 short opt\n")); return (PF_DROP); } if (pd2.proto == IPPROTO_AH) off2 += (opt6.ip6e_len + 2) * 4; else off2 += (opt6.ip6e_len + 1) * 8; pd2.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); break; #endif /* INET6 */ } if (PF_ANEQ(pd->dst, pd2.src, pd->af)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d outer dst: ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" inner src: "); pf_print_host(pd2.src, 0, pd2.af); printf(" -> "); pf_print_host(pd2.dst, 0, pd2.af); printf("\n"); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } switch (pd2.proto) { case IPPROTO_TCP: { struct tcphdr th; u_int32_t seq; struct pf_state_peer *src, *dst; u_int8_t dws; int copyback = 0; /* * Only the first 8 bytes of the TCP header can be * expected. Don't access any TCP header fields after * th_seq, an ackskew test is not possible. */ if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(tcp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_TCP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = th.th_sport; key.port[pd2.didx] = th.th_dport; STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->dst; dst = &(*state)->src; } else { src = &(*state)->src; dst = &(*state)->dst; } if (src->wscale && dst->wscale) dws = dst->wscale & PF_WSCALE_MASK; else dws = 0; /* Demodulate sequence number */ seq = ntohl(th.th_seq) - src->seqdiff; if (src->seqdiff) { pf_change_a(&th.th_seq, icmpsum, htonl(seq), 0); copyback = 1; } if (!((*state)->state_flags & PFSTATE_SLOPPY) && (!SEQ_GEQ(src->seqhi, seq) || !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: OK ICMP %d:%d ", icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != th.th_sport) pf_change_icmp(pd2.src, &th.th_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != th.th_dport) pf_change_icmp(pd2.dst, &th.th_dport, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); copyback = 1; } if (copyback) { switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t )&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, 8, (caddr_t)&th); } return (PF_PASS); break; } case IPPROTO_UDP: { struct udphdr uh; if (!pf_pull_hdr(m, off2, &uh, sizeof(uh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(udp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_UDP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = uh.uh_sport; key.port[pd2.didx] = uh.uh_dport; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != uh.uh_sport) pf_change_icmp(pd2.src, &uh.uh_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != uh.uh_dport) pf_change_icmp(pd2.dst, &uh.uh_dport, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, sizeof(uh), (caddr_t)&uh); } return (PF_PASS); break; } #ifdef INET case IPPROTO_ICMP: { struct icmp iih; if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short i" "(icmp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp_id; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp_id) pf_change_icmp(pd2.src, &iih.icmp_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp_id) pf_change_icmp(pd2.dst, &iih.icmp_id, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: { struct icmp6_hdr iih; if (!pf_pull_hdr(m, off2, &iih, sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(icmp6)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMPV6; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp6_id; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp6_id) pf_change_icmp(pd2.src, &iih.icmp6_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp6_id) pf_change_icmp(pd2.dst, &iih.icmp6_id, saddr, &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); m_copyback(m, off2, sizeof(struct icmp6_hdr), (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET6 */ default: { key.af = pd2.af; key.proto = pd2.proto; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = 0; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af)) pf_change_icmp(pd2.src, NULL, daddr, &nk->addr[pd2.sidx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af)) pf_change_icmp(pd2.dst, NULL, saddr, &nk->addr[pd2.didx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } } return (PF_PASS); break; } } } } static int pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = pd->proto; if (direction == PF_IN) { PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = key.port[1] = 0; } else { PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = key.port[0] = 0; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFOTHERS_SINGLE) src->state = PFOTHERS_SINGLE; if (dst->state == PFOTHERS_SINGLE) dst->state = PFOTHERS_MULTIPLE; /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE) (*state)->timeout = PFTM_OTHER_MULTIPLE; else (*state)->timeout = PFTM_OTHER_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; KASSERT(nk, ("%s: nk is null", __func__)); KASSERT(pd, ("%s: pd is null", __func__)); KASSERT(pd->src, ("%s: pd->src is null", __func__)); KASSERT(pd->dst, ("%s: pd->dst is null", __func__)); switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); #endif /* INET6 */ } } return (PF_PASS); } /* * ipoff and off are measured from the start of the mbuf chain. * h must be at "ipoff" on the mbuf chain. */ void * pf_pull_hdr(struct mbuf *m, int off, void *p, int len, u_short *actionp, u_short *reasonp, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; if (fragoff) { if (fragoff >= len) ACTION_SET(actionp, PF_PASS); else { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_FRAG); } return (NULL); } if (m->m_pkthdr.len < off + len || ntohs(h->ip_len) < off + len) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (m->m_pkthdr.len < off + len || (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < (unsigned)(off + len)) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET6 */ } m_copydata(m, off, len, p); return (p); } #ifdef RADIX_MPATH static int pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, int rtableid) { struct radix_node_head *rnh; struct sockaddr_in *dst; int ret = 1; int check_mpath; #ifdef INET6 struct sockaddr_in6 *dst6; struct route_in6 ro; #else struct route ro; #endif struct radix_node *rn; struct rtentry *rt; struct ifnet *ifp; check_mpath = 0; /* XXX: stick to table 0 for now */ rnh = rt_tables_get_rnh(0, af); if (rnh != NULL && rn_mpath_capable(rnh)) check_mpath = 1; bzero(&ro, sizeof(ro)); switch (af) { case AF_INET: dst = satosin(&ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = addr->v4; break; #ifdef INET6 case AF_INET6: /* * Skip check for addresses with embedded interface scope, * as they would always match anyway. */ if (IN6_IS_SCOPE_EMBED(&addr->v6)) goto out; dst6 = (struct sockaddr_in6 *)&ro.ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = addr->v6; break; #endif /* INET6 */ default: return (0); } /* Skip checks for ipsec interfaces */ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) goto out; switch (af) { #ifdef INET6 case AF_INET6: in6_rtalloc_ign(&ro, 0, rtableid); break; #endif #ifdef INET case AF_INET: in_rtalloc_ign((struct route *)&ro, 0, rtableid); break; #endif } if (ro.ro_rt != NULL) { /* No interface given, this is a no-route check */ if (kif == NULL) goto out; if (kif->pfik_ifp == NULL) { ret = 0; goto out; } /* Perform uRPF check if passed input interface */ ret = 0; rn = (struct radix_node *)ro.ro_rt; do { rt = (struct rtentry *)rn; ifp = rt->rt_ifp; if (kif->pfik_ifp == ifp) ret = 1; rn = rn_mpath_next(rn); } while (check_mpath == 1 && rn != NULL && ret == 0); } else ret = 0; out: if (ro.ro_rt != NULL) RTFREE(ro.ro_rt); return (ret); } #endif int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, int rtableid) { #ifdef INET struct nhop4_basic nh4; #endif #ifdef INET6 struct nhop6_basic nh6; #endif struct ifnet *ifp; #ifdef RADIX_MPATH struct radix_node_head *rnh; /* XXX: stick to table 0 for now */ rnh = rt_tables_get_rnh(0, af); if (rnh != NULL && rn_mpath_capable(rnh)) return (pf_routable_oldmpath(addr, af, kif, rtableid)); #endif /* * Skip check for addresses with embedded interface scope, * as they would always match anyway. */ if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6)) return (1); if (af != AF_INET && af != AF_INET6) return (0); /* Skip checks for ipsec interfaces */ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) return (1); ifp = NULL; switch (af) { #ifdef INET6 case AF_INET6: if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0) return (0); ifp = nh6.nh_ifp; break; #endif #ifdef INET case AF_INET: if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0) return (0); ifp = nh4.nh_ifp; break; #endif } /* No interface given, this is a no-route check */ if (kif == NULL) return (1); if (kif->pfik_ifp == NULL) return (0); /* Perform uRPF check if passed input interface */ if (kif->pfik_ifp == ifp) return (1); return (0); } #ifdef INET static void pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp) { struct mbuf *m0, *m1; struct sockaddr_in dst; struct ip *ip; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_src_node *sn = NULL; int error = 0; uint16_t ip_len, ip_off; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r->rt == PF_DUPTO) { if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) { if (s) PF_STATE_UNLOCK(s); return; } } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip = mtod(m0, struct ip *); bzero(&dst, sizeof(dst)); dst.sin_family = AF_INET; dst.sin_len = sizeof(dst); dst.sin_addr = ip->ip_dst; bzero(&naddr, sizeof(naddr)); if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } if (s == NULL) { pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET)) dst.sin_addr.s_addr = naddr.v4.s_addr; ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET)) dst.sin_addr.s_addr = s->rt_addr.v4.s_addr; ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } if (ifp == NULL) goto bad; if (oifp != ifp) { if (pf_test(PF_OUT, 0, ifp, &m0, inp) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip)\n", __func__)); goto bad; } ip = mtod(m0, struct ip *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* Copied from FreeBSD 10.0-CURRENT ip_output. */ m0->m_pkthdr.csum_flags |= CSUM_IP; if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= ifp->if_mtu || (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } m_clrprotoflags(m0); /* Avoid confusing lower layers. */ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; KMOD_IPSTAT_INC(ips_cantfrag); if (r->rt != PF_DUPTO) { icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp->if_mtu); goto done; } else goto bad; } error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist); if (error) goto bad; for (; m0; m0 = m1) { m1 = m0->m_nextpkt; m0->m_nextpkt = NULL; if (error == 0) { m_clrprotoflags(m0); error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); } else m_freem(m0); } if (error == 0) KMOD_IPSTAT_INC(ips_fragmented); done: if (r->rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET */ #ifdef INET6 static void pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp) { struct mbuf *m0; struct sockaddr_in6 dst; struct ip6_hdr *ip6; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_src_node *sn = NULL; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r->rt == PF_DUPTO) { if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) { if (s) PF_STATE_UNLOCK(s); return; } } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip6 = mtod(m0, struct ip6_hdr *); bzero(&dst, sizeof(dst)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(dst); dst.sin6_addr = ip6->ip6_dst; bzero(&naddr, sizeof(naddr)); if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } if (s == NULL) { pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &naddr, AF_INET6); ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &s->rt_addr, AF_INET6); ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; } if (s) PF_STATE_UNLOCK(s); if (ifp == NULL) goto bad; if (oifp != ifp) { if (pf_test6(PF_OUT, PFIL_FWD, ifp, &m0, inp) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip6_hdr)\n", __func__)); goto bad; } ip6 = mtod(m0, struct ip6_hdr *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & ~ifp->if_hwassist) { uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6); in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr)); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } /* * If the packet is too large for the outgoing interface, * send back an icmp6 error. */ if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr)) dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) nd6_output_ifp(ifp, ifp, m0, &dst, NULL); else { in6_ifstat_inc(ifp, ifs6_in_toobig); if (r->rt != PF_DUPTO) icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); else goto bad; } done: if (r->rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET6 */ /* * FreeBSD supports cksum offloads for the following drivers. * em(4), fxp(4), lge(4), ndis(4), nge(4), re(4), ti(4), txp(4), xl(4) * * CSUM_DATA_VALID | CSUM_PSEUDO_HDR : * network driver performed cksum including pseudo header, need to verify * csum_data * CSUM_DATA_VALID : * network driver performed cksum, needs to additional pseudo header * cksum computation with partial csum_data(i.e. lack of H/W support for * pseudo header, for instance hme(4), sk(4) and possibly gem(4)) * * After validating the cksum of packet, set both flag CSUM_DATA_VALID and * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper * TCP/UDP layer. * Also, set csum_data to 0xffff to force cksum validation. */ static int pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) { u_int16_t sum = 0; int hw_assist = 0; struct ip *ip; if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) return (1); if (m->m_pkthdr.len < off + len) return (1); switch (p) { case IPPROTO_TCP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_TCP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_UDP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif /* INET6 */ break; default: return (1); } if (!hw_assist) { switch (af) { case AF_INET: if (p == IPPROTO_ICMP) { if (m->m_len < off) return (1); m->m_data += off; m->m_len -= off; sum = in_cksum(m, len); m->m_data -= off; m->m_len += off; } else { if (m->m_len < sizeof(struct ip)) return (1); sum = in4_cksum(m, p, off, len); } break; #ifdef INET6 case AF_INET6: if (m->m_len < sizeof(struct ip6_hdr)) return (1); sum = in6_cksum(m, p, off, len); break; #endif /* INET6 */ default: return (1); } } if (sum) { switch (p) { case IPPROTO_TCP: { KMOD_TCPSTAT_INC(tcps_rcvbadsum); break; } case IPPROTO_UDP: { KMOD_UDPSTAT_INC(udps_badsum); break; } #ifdef INET case IPPROTO_ICMP: { KMOD_ICMPSTAT_INC(icps_checksum); break; } #endif #ifdef INET6 case IPPROTO_ICMPV6: { KMOD_ICMP6STAT_INC(icp6s_checksum); break; } #endif /* INET6 */ } return (1); } else { if (p == IPPROTO_TCP || p == IPPROTO_UDP) { m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } return (0); } #ifdef INET int pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) { struct pfi_kif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0; struct ip *h = NULL; struct m_tag *ipfwtag; struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_ruleset *ruleset = NULL; struct pf_pdesc pd; int off, dirndx, pqid = 0; PF_RULES_RLOCK_TRACKER; M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); memset(&pd, 0, sizeof(pd)); kif = (struct pfi_kif *)ifp->if_pf_kif; if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) return (PF_PASS); if (m->m_flags & M_SKIP_FIREWALL) return (PF_PASS); pd.pf_mtag = pf_find_mtag(m); PF_RULES_RLOCK(); if (ip_divert_ptr != NULL && ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) { struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1); if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; goto done; } pd.pf_mtag->flags |= PF_PACKET_LOOPED; m_tag_delete(m, ipfwtag); } if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) { m->m_flags |= M_FASTFWD_OURS; pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT; } } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { /* We do IP header normalization and packet reassembly here */ action = PF_DROP; goto done; } m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip *); off = h->ip_hl << 2; if (off < (int)sizeof(struct ip)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } pd.src = (struct pf_addr *)&h->ip_src; pd.dst = (struct pf_addr *)&h->ip_dst; pd.sport = pd.dport = NULL; pd.ip_sum = &h->ip_sum; pd.proto_sum = NULL; pd.proto = h->ip_p; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 1 : 0; pd.af = AF_INET; pd.tos = h->ip_tos & ~IPTOS_ECN_MASK; pd.tot_len = ntohs(h->ip_len); /* handle fragments that didn't get reassembled by normalization */ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); goto done; } switch (h->ip_p) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); if ((th.th_flags & TH_ACK) && pd.p_len == 0) pqid = 1; action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_ICMP: { struct icmp ih; pd.hdr.icmp = &ih; if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN, &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } #ifdef INET6 case IPPROTO_ICMPV6: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv4 packet with ICMPv6 payload\n")); goto done; } #endif default: action = pf_test_state_other(&s, dir, kif, m, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (action == PF_PASS && h->ip_hl > 5 && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = r->log; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with ip options\n")); } if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (r->rtableid >= 0) M_SETFIB(m, r->rtableid); if (r->scrub_flags & PFSTATE_SETPRIO) { if (pd.tos & IPTOS_LOWDELAY) pqid = 1; if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate 802.1q mtag\n")); } } #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } else { if (s != NULL) pd.pf_mtag->qid_hash = pf_state_hash(s); if (pqid || (pd.tos & IPTOS_LOWDELAY)) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* Add hints for ecn. */ pd.pf_mtag->hdr = h; } } #endif /* ALTQ */ /* * connections redirected to loopback should not match sockets * bound specifically to loopback due to security implications, * see tcp_input() and in_pcblookup_listen(). */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN_LOOPBACK(ntohl(pd.dst->v4.s_addr))) m->m_flags |= M_SKIP_FIREWALL; if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL && !PACKET_LOOPED(&pd)) { ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (ipfwtag != NULL) { ((struct ipfw_rule_ref *)(ipfwtag+1))->info = ntohs(r->divert.port); ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir; if (s) PF_STATE_UNLOCK(s); m_tag_prepend(m, ipfwtag); if (m->m_flags & M_FASTFWD_OURS) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate tag\n")); } else { pd.pf_mtag->flags |= PF_FASTFWD_OURS_PRESENT; m->m_flags &= ~M_FASTFWD_OURS; } } ip_divert_ptr(*m0, dir == PF_IN); *m0 = NULL; return (action); } else { /* XXX: ipfw has the same behaviour! */ action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate divert tag\n")); } } if (log) { struct pf_rule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd, (s == NULL)); } kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len; kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++; if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); r->packets[dirndx]++; r->bytes[dirndx] += pd.tot_len; if (a != NULL) { a->packets[dirndx]++; a->bytes[dirndx] += pd.tot_len; } if (s != NULL) { if (s->nat_rule.ptr != NULL) { s->nat_rule.ptr->packets[dirndx]++; s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; } if (s->src_node != NULL) { s->src_node->packets[dirndx]++; s->src_node->bytes[dirndx] += pd.tot_len; } if (s->nat_src_node != NULL) { s->nat_src_node->packets[dirndx]++; s->nat_src_node->bytes[dirndx] += pd.tot_len; } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? pd.src : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_OUT)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? pd.dst : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_IN)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; case PF_DROP: m_freem(*m0); *m0 = NULL; break; default: /* pf_route() returns unlocked. */ if (r->rt) { pf_route(m0, r, dir, kif->pfik_ifp, s, &pd, inp); return (action); } break; } if (s) PF_STATE_UNLOCK(s); return (action); } #endif /* INET */ #ifdef INET6 int pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) { struct pfi_kif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0, *n = NULL; struct m_tag *mtag; struct ip6_hdr *h = NULL; struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_ruleset *ruleset = NULL; struct pf_pdesc pd; int off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0; PF_RULES_RLOCK_TRACKER; M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); memset(&pd, 0, sizeof(pd)); pd.pf_mtag = pf_find_mtag(m); if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED) return (PF_PASS); kif = (struct pfi_kif *)ifp->if_pf_kif; if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) return (PF_PASS); if (m->m_flags & M_SKIP_FIREWALL) return (PF_PASS); PF_RULES_RLOCK(); /* We do IP header normalization and packet reassembly here */ if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { action = PF_DROP; goto done; } m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip6_hdr *); -#if 1 /* - * we do not support jumbogram yet. if we keep going, zero ip6_plen + * we do not support jumbogram. if we keep going, zero ip6_plen * will do something bad, so drop the packet for now. */ if (htons(h->ip6_plen) == 0) { action = PF_DROP; REASON_SET(&reason, PFRES_NORM); /*XXX*/ goto done; } -#endif pd.src = (struct pf_addr *)&h->ip6_src; pd.dst = (struct pf_addr *)&h->ip6_dst; pd.sport = pd.dport = NULL; pd.ip_sum = NULL; pd.proto_sum = NULL; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 1 : 0; pd.af = AF_INET6; pd.tos = 0; pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); pd.proto = h->ip6_nxt; do { switch (pd.proto) { case IPPROTO_FRAGMENT: action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); if (action == PF_DROP) REASON_SET(&reason, PFRES_FRAG); goto done; case IPPROTO_ROUTING: { struct ip6_rthdr rthdr; if (rh_cnt++) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 more than one rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 rthdr0\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } /* FALLTHROUGH */ } case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short opt\n")); action = PF_DROP; log = 1; goto done; } if (pd.proto == IPPROTO_AH) off += (opt6.ip6e_len + 2) * 4; else off += (opt6.ip6e_len + 1) * 8; pd.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); /* if there's no routing header, use unmodified mbuf for checksumming */ if (!n) n = m; switch (pd.proto) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_ICMP: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv6 packet with ICMPv4 payload\n")); goto done; } case IPPROTO_ICMPV6: { struct icmp6_hdr ih; pd.hdr.icmp6 = &ih; if (!pf_pull_hdr(m, off, &ih, sizeof(ih), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } default: action = pf_test_state_other(&s, dir, kif, m, &pd); if (action == PF_PASS) { if (V_pfsync_update_state_ptr != NULL) V_pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (n != m) { m_freem(n); n = NULL; } /* handle dangerous IPv6 extension headers. */ if (action == PF_PASS && rh_cnt && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = r->log; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with dangerous v6 headers\n")); } if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (r->rtableid >= 0) M_SETFIB(m, r->rtableid); if (r->scrub_flags & PFSTATE_SETPRIO) { if (pd.tos & IPTOS_LOWDELAY) pqid = 1; if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate 802.1q mtag\n")); } } #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } else { if (s != NULL) pd.pf_mtag->qid_hash = pf_state_hash(s); if (pd.tos & IPTOS_LOWDELAY) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* Add hints for ecn. */ pd.pf_mtag->hdr = h; } } #endif /* ALTQ */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) m->m_flags |= M_SKIP_FIREWALL; /* XXX: Anybody working on it?! */ if (r->divert.port) printf("pf: divert(9) is not supported for IPv6\n"); if (log) { struct pf_rule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset, &pd, (s == NULL)); } kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len; kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++; if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); r->packets[dirndx]++; r->bytes[dirndx] += pd.tot_len; if (a != NULL) { a->packets[dirndx]++; a->bytes[dirndx] += pd.tot_len; } if (s != NULL) { if (s->nat_rule.ptr != NULL) { s->nat_rule.ptr->packets[dirndx]++; s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; } if (s->src_node != NULL) { s->src_node->packets[dirndx]++; s->src_node->bytes[dirndx] += pd.tot_len; } if (s->nat_src_node != NULL) { s->nat_src_node->packets[dirndx]++; s->nat_src_node->bytes[dirndx] += pd.tot_len; } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? pd.src : &s->key[(s->direction == PF_IN)]->addr[0], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? pd.dst : &s->key[(s->direction == PF_IN)]->addr[1], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; case PF_DROP: m_freem(*m0); *m0 = NULL; break; default: /* pf_route6() returns unlocked. */ if (r->rt) { pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd, inp); return (action); } break; } if (s) PF_STATE_UNLOCK(s); /* If reassembled packet passed, create new fragments. */ if (action == PF_PASS && *m0 && (pflags & PFIL_FWD) && (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL) action = pf_refragment6(ifp, m0, mtag); return (action); } #endif /* INET6 */ Index: projects/fuse2/sys/netpfil/pf/pf_norm.c =================================================================== --- projects/fuse2/sys/netpfil/pf/pf_norm.c (revision 350434) +++ projects/fuse2/sys/netpfil/pf/pf_norm.c (revision 350435) @@ -1,2029 +1,2004 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2001 Niels Provos * Copyright 2011-2018 Alexander Bluhm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ struct pf_frent { TAILQ_ENTRY(pf_frent) fr_next; struct mbuf *fe_m; uint16_t fe_hdrlen; /* ipv4 header length with ip options ipv6, extension, fragment header */ uint16_t fe_extoff; /* last extension header offset or 0 */ uint16_t fe_len; /* fragment length */ uint16_t fe_off; /* fragment offset */ uint16_t fe_mff; /* more fragment flag */ }; struct pf_fragment_cmp { struct pf_addr frc_src; struct pf_addr frc_dst; uint32_t frc_id; sa_family_t frc_af; uint8_t frc_proto; }; struct pf_fragment { struct pf_fragment_cmp fr_key; #define fr_src fr_key.frc_src #define fr_dst fr_key.frc_dst #define fr_id fr_key.frc_id #define fr_af fr_key.frc_af #define fr_proto fr_key.frc_proto /* pointers to queue element */ struct pf_frent *fr_firstoff[PF_FRAG_ENTRY_POINTS]; /* count entries between pointers */ uint8_t fr_entries[PF_FRAG_ENTRY_POINTS]; RB_ENTRY(pf_fragment) fr_entry; TAILQ_ENTRY(pf_fragment) frag_next; uint32_t fr_timeout; uint16_t fr_maxlen; /* maximum length of single fragment */ u_int16_t fr_holes; /* number of holes in the queue */ TAILQ_HEAD(pf_fragq, pf_frent) fr_queue; }; struct pf_fragment_tag { uint16_t ft_hdrlen; /* header length of reassembled pkt */ uint16_t ft_extoff; /* last extension header offset or 0 */ uint16_t ft_maxlen; /* maximum fragment payload length */ uint32_t ft_id; /* fragment id */ }; static struct mtx pf_frag_mtx; MTX_SYSINIT(pf_frag_mtx, &pf_frag_mtx, "pf fragments", MTX_DEF); #define PF_FRAG_LOCK() mtx_lock(&pf_frag_mtx) #define PF_FRAG_UNLOCK() mtx_unlock(&pf_frag_mtx) #define PF_FRAG_ASSERT() mtx_assert(&pf_frag_mtx, MA_OWNED) VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ VNET_DEFINE_STATIC(uma_zone_t, pf_frent_z); #define V_pf_frent_z VNET(pf_frent_z) VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z); #define V_pf_frag_z VNET(pf_frag_z) TAILQ_HEAD(pf_fragqueue, pf_fragment); TAILQ_HEAD(pf_cachequeue, pf_fragment); VNET_DEFINE_STATIC(struct pf_fragqueue, pf_fragqueue); #define V_pf_fragqueue VNET(pf_fragqueue) RB_HEAD(pf_frag_tree, pf_fragment); VNET_DEFINE_STATIC(struct pf_frag_tree, pf_frag_tree); #define V_pf_frag_tree VNET(pf_frag_tree) static int pf_frag_compare(struct pf_fragment *, struct pf_fragment *); static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static void pf_flush_fragments(void); static void pf_free_fragment(struct pf_fragment *); static void pf_remove_fragment(struct pf_fragment *); static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *, struct tcphdr *, int, sa_family_t); static struct pf_frent *pf_create_fragment(u_short *); static int pf_frent_holes(struct pf_frent *frent); static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree); static inline int pf_frent_index(struct pf_frent *); static int pf_frent_insert(struct pf_fragment *, struct pf_frent *, struct pf_frent *); void pf_frent_remove(struct pf_fragment *, struct pf_frent *); struct pf_frent *pf_frent_previous(struct pf_fragment *, struct pf_frent *); static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *, struct pf_frent *, u_short *); static struct mbuf *pf_join_fragment(struct pf_fragment *); #ifdef INET static void pf_scrub_ip(struct mbuf **, uint32_t, uint8_t, uint8_t); static int pf_reassemble(struct mbuf **, struct ip *, int, u_short *); #endif /* INET */ #ifdef INET6 static int pf_reassemble6(struct mbuf **, struct ip6_hdr *, struct ip6_frag *, uint16_t, uint16_t, u_short *); static void pf_scrub_ip6(struct mbuf **, uint8_t); #endif /* INET6 */ #define DPFPRINTF(x) do { \ if (V_pf_status.debug >= PF_DEBUG_MISC) { \ printf("%s: ", __func__); \ printf x ; \ } \ } while(0) #ifdef INET static void pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key) { key->frc_src.v4 = ip->ip_src; key->frc_dst.v4 = ip->ip_dst; key->frc_af = AF_INET; key->frc_proto = ip->ip_p; key->frc_id = ip->ip_id; } #endif /* INET */ void pf_normalize_init(void) { V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_state_scrub_z = uma_zcreate("pf state scrubs", sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z; V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT); uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached"); TAILQ_INIT(&V_pf_fragqueue); } void pf_normalize_cleanup(void) { uma_zdestroy(V_pf_state_scrub_z); uma_zdestroy(V_pf_frent_z); uma_zdestroy(V_pf_frag_z); } static int pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) { int diff; if ((diff = a->fr_id - b->fr_id) != 0) return (diff); if ((diff = a->fr_proto - b->fr_proto) != 0) return (diff); if ((diff = a->fr_af - b->fr_af) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0) return (diff); return (0); } void pf_purge_expired_fragments(void) { u_int32_t expire = time_uptime - V_pf_default_rule.timeout[PFTM_FRAG]; pf_purge_fragments(expire); } void pf_purge_fragments(uint32_t expire) { struct pf_fragment *frag; PF_FRAG_LOCK(); while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) { if (frag->fr_timeout > expire) break; DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); pf_free_fragment(frag); } PF_FRAG_UNLOCK(); } /* * Try to flush old fragments to make space for new ones */ static void pf_flush_fragments(void) { struct pf_fragment *frag; int goal; PF_FRAG_ASSERT(); goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10; DPFPRINTF(("trying to free %d frag entriess\n", goal)); while (goal < uma_zone_get_cur(V_pf_frent_z)) { frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); if (frag) pf_free_fragment(frag); else break; } } /* Frees the fragments and all associated entries */ static void pf_free_fragment(struct pf_fragment *frag) { struct pf_frent *frent; PF_FRAG_ASSERT(); /* Free all fragments */ for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = TAILQ_FIRST(&frag->fr_queue)) { TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); m_freem(frent->fe_m); uma_zfree(V_pf_frent_z, frent); } pf_remove_fragment(frag); } static struct pf_fragment * pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree) { struct pf_fragment *frag; PF_FRAG_ASSERT(); frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key); if (frag != NULL) { /* XXX Are we sure we want to update the timeout? */ frag->fr_timeout = time_uptime; TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); } return (frag); } /* Removes a fragment from the fragment queue and frees the fragment */ static void pf_remove_fragment(struct pf_fragment *frag) { PF_FRAG_ASSERT(); KASSERT(frag, ("frag != NULL")); RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); uma_zfree(V_pf_frag_z, frag); } static struct pf_frent * pf_create_fragment(u_short *reason) { struct pf_frent *frent; PF_FRAG_ASSERT(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { pf_flush_fragments(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { REASON_SET(reason, PFRES_MEMORY); return (NULL); } } return (frent); } /* * Calculate the additional holes that were created in the fragment * queue by inserting this fragment. A fragment in the middle * creates one more hole by splitting. For each connected side, * it loses one hole. * Fragment entry must be in the queue when calling this function. */ static int pf_frent_holes(struct pf_frent *frent) { struct pf_frent *prev = TAILQ_PREV(frent, pf_fragq, fr_next); struct pf_frent *next = TAILQ_NEXT(frent, fr_next); int holes = 1; if (prev == NULL) { if (frent->fe_off == 0) holes--; } else { KASSERT(frent->fe_off != 0, ("frent->fe_off != 0")); if (frent->fe_off == prev->fe_off + prev->fe_len) holes--; } if (next == NULL) { if (!frent->fe_mff) holes--; } else { KASSERT(frent->fe_mff, ("frent->fe_mff")); if (next->fe_off == frent->fe_off + frent->fe_len) holes--; } return holes; } static inline int pf_frent_index(struct pf_frent *frent) { /* * We have an array of 16 entry points to the queue. A full size * 65535 octet IP packet can have 8192 fragments. So the queue * traversal length is at most 512 and at most 16 entry points are * checked. We need 128 additional bytes on a 64 bit architecture. */ CTASSERT(((u_int16_t)0xffff &~ 7) / (0x10000 / PF_FRAG_ENTRY_POINTS) == 16 - 1); CTASSERT(((u_int16_t)0xffff >> 3) / PF_FRAG_ENTRY_POINTS == 512 - 1); return frent->fe_off / (0x10000 / PF_FRAG_ENTRY_POINTS); } static int pf_frent_insert(struct pf_fragment *frag, struct pf_frent *frent, struct pf_frent *prev) { int index; CTASSERT(PF_FRAG_ENTRY_LIMIT <= 0xff); /* * A packet has at most 65536 octets. With 16 entry points, each one * spawns 4096 octets. We limit these to 64 fragments each, which * means on average every fragment must have at least 64 octets. */ index = pf_frent_index(frent); if (frag->fr_entries[index] >= PF_FRAG_ENTRY_LIMIT) return ENOBUFS; frag->fr_entries[index]++; if (prev == NULL) { TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next); } else { KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off, ("overlapping fragment")); TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next); } if (frag->fr_firstoff[index] == NULL) { KASSERT(prev == NULL || pf_frent_index(prev) < index, ("prev == NULL || pf_frent_index(pref) < index")); frag->fr_firstoff[index] = frent; } else { if (frent->fe_off < frag->fr_firstoff[index]->fe_off) { KASSERT(prev == NULL || pf_frent_index(prev) < index, ("prev == NULL || pf_frent_index(pref) < index")); frag->fr_firstoff[index] = frent; } else { KASSERT(prev != NULL, ("prev != NULL")); KASSERT(pf_frent_index(prev) == index, ("pf_frent_index(prev) == index")); } } frag->fr_holes += pf_frent_holes(frent); return 0; } void pf_frent_remove(struct pf_fragment *frag, struct pf_frent *frent) { #ifdef INVARIANTS struct pf_frent *prev = TAILQ_PREV(frent, pf_fragq, fr_next); #endif struct pf_frent *next = TAILQ_NEXT(frent, fr_next); int index; frag->fr_holes -= pf_frent_holes(frent); index = pf_frent_index(frent); KASSERT(frag->fr_firstoff[index] != NULL, ("frent not found")); if (frag->fr_firstoff[index]->fe_off == frent->fe_off) { if (next == NULL) { frag->fr_firstoff[index] = NULL; } else { KASSERT(frent->fe_off + frent->fe_len <= next->fe_off, ("overlapping fragment")); if (pf_frent_index(next) == index) { frag->fr_firstoff[index] = next; } else { frag->fr_firstoff[index] = NULL; } } } else { KASSERT(frag->fr_firstoff[index]->fe_off < frent->fe_off, ("frag->fr_firstoff[index]->fe_off < frent->fe_off")); KASSERT(prev != NULL, ("prev != NULL")); KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off, ("overlapping fragment")); KASSERT(pf_frent_index(prev) == index, ("pf_frent_index(prev) == index")); } TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); KASSERT(frag->fr_entries[index] > 0, ("No fragments remaining")); frag->fr_entries[index]--; } struct pf_frent * pf_frent_previous(struct pf_fragment *frag, struct pf_frent *frent) { struct pf_frent *prev, *next; int index; /* * If there are no fragments after frag, take the final one. Assume * that the global queue is not empty. */ prev = TAILQ_LAST(&frag->fr_queue, pf_fragq); KASSERT(prev != NULL, ("prev != NULL")); if (prev->fe_off <= frent->fe_off) return prev; /* * We want to find a fragment entry that is before frag, but still * close to it. Find the first fragment entry that is in the same * entry point or in the first entry point after that. As we have * already checked that there are entries behind frag, this will * succeed. */ for (index = pf_frent_index(frent); index < PF_FRAG_ENTRY_POINTS; index++) { prev = frag->fr_firstoff[index]; if (prev != NULL) break; } KASSERT(prev != NULL, ("prev != NULL")); /* * In prev we may have a fragment from the same entry point that is * before frent, or one that is just one position behind frent. * In the latter case, we go back one step and have the predecessor. * There may be none if the new fragment will be the first one. */ if (prev->fe_off > frent->fe_off) { prev = TAILQ_PREV(prev, pf_fragq, fr_next); if (prev == NULL) return NULL; KASSERT(prev->fe_off <= frent->fe_off, ("prev->fe_off <= frent->fe_off")); return prev; } /* * In prev is the first fragment of the entry point. The offset * of frag is behind it. Find the closest previous fragment. */ for (next = TAILQ_NEXT(prev, fr_next); next != NULL; next = TAILQ_NEXT(next, fr_next)) { if (next->fe_off > frent->fe_off) break; prev = next; } return prev; } static struct pf_fragment * pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, u_short *reason) { struct pf_frent *after, *next, *prev; struct pf_fragment *frag; uint16_t total; PF_FRAG_ASSERT(); /* No empty fragments. */ if (frent->fe_len == 0) { DPFPRINTF(("bad fragment: len 0")); goto bad_fragment; } /* All fragments are 8 byte aligned. */ if (frent->fe_mff && (frent->fe_len & 0x7)) { DPFPRINTF(("bad fragment: mff and len %d", frent->fe_len)); goto bad_fragment; } /* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */ if (frent->fe_off + frent->fe_len > IP_MAXPACKET) { DPFPRINTF(("bad fragment: max packet %d", frent->fe_off + frent->fe_len)); goto bad_fragment; } DPFPRINTF((key->frc_af == AF_INET ? "reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d", key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len)); /* Fully buffer all of the fragments in this fragment queue. */ frag = pf_find_fragment(key, &V_pf_frag_tree); /* Create a new reassembly queue for this packet. */ if (frag == NULL) { frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { pf_flush_fragments(); frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { REASON_SET(reason, PFRES_MEMORY); goto drop_fragment; } } *(struct pf_fragment_cmp *)frag = *key; memset(frag->fr_firstoff, 0, sizeof(frag->fr_firstoff)); memset(frag->fr_entries, 0, sizeof(frag->fr_entries)); frag->fr_timeout = time_uptime; frag->fr_maxlen = frent->fe_len; frag->fr_holes = 1; TAILQ_INIT(&frag->fr_queue); RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); /* We do not have a previous fragment, cannot fail. */ pf_frent_insert(frag, frent, NULL); return (frag); } KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue")); /* Remember maximum fragment len for refragmentation. */ if (frent->fe_len > frag->fr_maxlen) frag->fr_maxlen = frent->fe_len; /* Maximum data we have seen already. */ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; /* Non terminal fragments must have more fragments flag. */ if (frent->fe_off + frent->fe_len < total && !frent->fe_mff) goto bad_fragment; /* Check if we saw the last fragment already. */ if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) { if (frent->fe_off + frent->fe_len > total || (frent->fe_off + frent->fe_len == total && frent->fe_mff)) goto bad_fragment; } else { if (frent->fe_off + frent->fe_len == total && !frent->fe_mff) goto bad_fragment; } /* Find neighbors for newly inserted fragment */ prev = pf_frent_previous(frag, frent); if (prev == NULL) { after = TAILQ_FIRST(&frag->fr_queue); KASSERT(after != NULL, ("after != NULL")); } else { after = TAILQ_NEXT(prev, fr_next); } if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) { uint16_t precut; precut = prev->fe_off + prev->fe_len - frent->fe_off; if (precut >= frent->fe_len) goto bad_fragment; DPFPRINTF(("overlap -%d", precut)); m_adj(frent->fe_m, precut); frent->fe_off += precut; frent->fe_len -= precut; } for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off; after = next) { uint16_t aftercut; aftercut = frent->fe_off + frent->fe_len - after->fe_off; DPFPRINTF(("adjust overlap %d", aftercut)); if (aftercut < after->fe_len) { m_adj(after->fe_m, aftercut); after->fe_off += aftercut; after->fe_len -= aftercut; break; } /* This fragment is completely overlapped, lose it. */ next = TAILQ_NEXT(after, fr_next); pf_frent_remove(frag, after); m_freem(after->fe_m); uma_zfree(V_pf_frent_z, after); } /* If part of the queue gets too long, there is not way to recover. */ if (pf_frent_insert(frag, frent, prev)) { DPFPRINTF(("fragment queue limit exceeded")); goto bad_fragment; } return (frag); bad_fragment: REASON_SET(reason, PFRES_FRAG); drop_fragment: uma_zfree(V_pf_frent_z, frent); return (NULL); } static struct mbuf * pf_join_fragment(struct pf_fragment *frag) { struct mbuf *m, *m2; struct pf_frent *frent, *next; frent = TAILQ_FIRST(&frag->fr_queue); next = TAILQ_NEXT(frent, fr_next); m = frent->fe_m; m_adj(m, (frent->fe_hdrlen + frent->fe_len) - m->m_pkthdr.len); uma_zfree(V_pf_frent_z, frent); for (frent = next; frent != NULL; frent = next) { next = TAILQ_NEXT(frent, fr_next); m2 = frent->fe_m; /* Strip off ip header. */ m_adj(m2, frent->fe_hdrlen); /* Strip off any trailing bytes. */ m_adj(m2, frent->fe_len - m2->m_pkthdr.len); uma_zfree(V_pf_frent_z, frent); m_cat(m, m2); } /* Remove from fragment queue. */ pf_remove_fragment(frag); return (m); } #ifdef INET static int pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason) { struct mbuf *m = *m0; struct pf_frent *frent; struct pf_fragment *frag; struct pf_fragment_cmp key; uint16_t total, hdrlen; /* Get an entry for the fragment queue */ if ((frent = pf_create_fragment(reason)) == NULL) return (PF_DROP); frent->fe_m = m; frent->fe_hdrlen = ip->ip_hl << 2; frent->fe_extoff = 0; frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2); frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; frent->fe_mff = ntohs(ip->ip_off) & IP_MF; pf_ip2key(ip, dir, &key); if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) return (PF_DROP); /* The mbuf is part of the fragment entry, no direct free or access */ m = *m0 = NULL; if (frag->fr_holes) { DPFPRINTF(("frag %d, holes %d", frag->fr_id, frag->fr_holes)); return (PF_PASS); /* drop because *m0 is NULL, no error */ } /* We have all the data */ frent = TAILQ_FIRST(&frag->fr_queue); KASSERT(frent != NULL, ("frent != NULL")); total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; hdrlen = frent->fe_hdrlen; m = *m0 = pf_join_fragment(frag); frag = NULL; if (m->m_flags & M_PKTHDR) { int plen = 0; for (m = *m0; m; m = m->m_next) plen += m->m_len; m = *m0; m->m_pkthdr.len = plen; } ip = mtod(m, struct ip *); ip->ip_len = htons(hdrlen + total); ip->ip_off &= ~(IP_MF|IP_OFFMASK); if (hdrlen + total > IP_MAXPACKET) { DPFPRINTF(("drop: too big: %d", total)); ip->ip_len = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test() */ return (PF_DROP); } DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); return (PF_PASS); } #endif /* INET */ #ifdef INET6 static int pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr, uint16_t hdrlen, uint16_t extoff, u_short *reason) { struct mbuf *m = *m0; struct pf_frent *frent; struct pf_fragment *frag; struct pf_fragment_cmp key; struct m_tag *mtag; struct pf_fragment_tag *ftag; int off; uint32_t frag_id; uint16_t total, maxlen; uint8_t proto; PF_FRAG_LOCK(); /* Get an entry for the fragment queue. */ if ((frent = pf_create_fragment(reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); } frent->fe_m = m; frent->fe_hdrlen = hdrlen; frent->fe_extoff = extoff; frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen; frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK); frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG; key.frc_src.v6 = ip6->ip6_src; key.frc_dst.v6 = ip6->ip6_dst; key.frc_af = AF_INET6; /* Only the first fragment's protocol is relevant. */ key.frc_proto = 0; key.frc_id = fraghdr->ip6f_ident; if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); } /* The mbuf is part of the fragment entry, no direct free or access. */ m = *m0 = NULL; if (frag->fr_holes) { DPFPRINTF(("frag %d, holes %d", frag->fr_id, frag->fr_holes)); PF_FRAG_UNLOCK(); return (PF_PASS); /* Drop because *m0 is NULL, no error. */ } /* We have all the data. */ frent = TAILQ_FIRST(&frag->fr_queue); KASSERT(frent != NULL, ("frent != NULL")); extoff = frent->fe_extoff; maxlen = frag->fr_maxlen; frag_id = frag->fr_id; total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag); m = *m0 = pf_join_fragment(frag); frag = NULL; PF_FRAG_UNLOCK(); /* Take protocol from first fragment header. */ m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off); KASSERT(m, ("%s: short mbuf chain", __func__)); proto = *(mtod(m, caddr_t) + off); m = *m0; /* Delete frag6 header */ if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0) goto fail; if (m->m_flags & M_PKTHDR) { int plen = 0; for (m = *m0; m; m = m->m_next) plen += m->m_len; m = *m0; m->m_pkthdr.len = plen; } if ((mtag = m_tag_get(PF_REASSEMBLED, sizeof(struct pf_fragment_tag), M_NOWAIT)) == NULL) goto fail; ftag = (struct pf_fragment_tag *)(mtag + 1); ftag->ft_hdrlen = hdrlen; ftag->ft_extoff = extoff; ftag->ft_maxlen = maxlen; ftag->ft_id = frag_id; m_tag_prepend(m, mtag); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total); if (extoff) { /* Write protocol into next field of last extension header. */ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), &off); KASSERT(m, ("%s: short mbuf chain", __func__)); *(mtod(m, char *) + off) = proto; m = *m0; } else ip6->ip6_nxt = proto; if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) { DPFPRINTF(("drop: too big: %d", total)); ip6->ip6_plen = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test6(). */ return (PF_DROP); } DPFPRINTF(("complete: %p(%d)", m, ntohs(ip6->ip6_plen))); return (PF_PASS); fail: REASON_SET(reason, PFRES_MEMORY); /* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later. */ return (PF_DROP); } #endif /* INET6 */ #ifdef INET6 int pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag) { struct mbuf *m = *m0, *t; struct pf_fragment_tag *ftag = (struct pf_fragment_tag *)(mtag + 1); struct pf_pdesc pd; uint32_t frag_id; uint16_t hdrlen, extoff, maxlen; uint8_t proto; int error, action; hdrlen = ftag->ft_hdrlen; extoff = ftag->ft_extoff; maxlen = ftag->ft_maxlen; frag_id = ftag->ft_id; m_tag_delete(m, mtag); mtag = NULL; ftag = NULL; if (extoff) { int off; /* Use protocol from next field of last extension header */ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), &off); KASSERT((m != NULL), ("pf_refragment6: short mbuf chain")); proto = *(mtod(m, caddr_t) + off); *(mtod(m, char *) + off) = IPPROTO_FRAGMENT; m = *m0; } else { struct ip6_hdr *hdr; hdr = mtod(m, struct ip6_hdr *); proto = hdr->ip6_nxt; hdr->ip6_nxt = IPPROTO_FRAGMENT; } /* The MTU must be a multiple of 8 bytes, or we risk doing the * fragmentation wrong. */ maxlen = maxlen & ~7; /* * Maxlen may be less than 8 if there was only a single * fragment. As it was fragmented before, add a fragment * header also for a single fragment. If total or maxlen * is less than 8, ip6_fragment() will return EMSGSIZE and * we drop the packet. */ error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id); m = (*m0)->m_nextpkt; (*m0)->m_nextpkt = NULL; if (error == 0) { /* The first mbuf contains the unfragmented packet. */ m_freem(*m0); *m0 = NULL; action = PF_PASS; } else { /* Drop expects an mbuf to free. */ DPFPRINTF(("refragment error %d", error)); action = PF_DROP; } for (t = m; m; m = t) { t = m->m_nextpkt; m->m_nextpkt = NULL; m->m_flags |= M_SKIP_FIREWALL; memset(&pd, 0, sizeof(pd)); pd.pf_mtag = pf_find_mtag(m); if (error == 0) ip6_forward(m, 0); else m_freem(m); } return (action); } #endif /* INET6 */ #ifdef INET int pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_rule *r; struct ip *h = mtod(m, struct ip *); int mff = (ntohs(h->ip_off) & IP_MF); int hlen = h->ip_hl << 2; u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; u_int16_t max; int ip_len; int ip_off; int tag = -1; int verdict; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != h->ip_p) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip_src.s_addr, AF_INET, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip_dst.s_addr, AF_INET, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else break; } if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); else { r->packets[dir == PF_OUT]++; r->bytes[dir == PF_OUT] += pd->tot_len; } /* Check for illegal packets */ if (hlen < (int)sizeof(struct ip)) { REASON_SET(reason, PFRES_NORM); goto drop; } if (hlen > ntohs(h->ip_len)) { REASON_SET(reason, PFRES_NORM); goto drop; } /* Clear IP_DF if the rule uses the no-df option */ if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* We will need other tests here */ if (!fragoff && !mff) goto no_fragment; /* We're dealing with a fragment now. Don't allow fragments * with IP_DF to enter the cache. If the flag was cleared by * no-df above, fine. Otherwise drop it. */ if (h->ip_off & htons(IP_DF)) { DPFPRINTF(("IP_DF\n")); goto bad; } ip_len = ntohs(h->ip_len) - hlen; ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3; /* All fragments are 8 byte aligned */ if (mff && (ip_len & 0x7)) { DPFPRINTF(("mff and %d\n", ip_len)); goto bad; } /* Respect maximum length */ if (fragoff + ip_len > IP_MAXPACKET) { DPFPRINTF(("max packet %d\n", fragoff + ip_len)); goto bad; } max = fragoff + ip_len; /* Fully buffer all of the fragments * Might return a completely reassembled mbuf, or NULL */ PF_FRAG_LOCK(); DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); verdict = pf_reassemble(m0, h, dir, reason); PF_FRAG_UNLOCK(); if (verdict != PF_PASS) return (PF_DROP); m = *m0; if (m == NULL) return (PF_DROP); h = mtod(m, struct ip *); no_fragment: /* At this point, only IP_DF is allowed in ip_off */ if (h->ip_off & ~htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos); return (PF_PASS); bad: DPFPRINTF(("dropping bad fragment\n")); REASON_SET(reason, PFRES_FRAG); drop: if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif #ifdef INET6 int pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_rule *r; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); int extoff; int off; struct ip6_ext ext; struct ip6_opt opt; - struct ip6_opt_jumbo jumbo; struct ip6_frag frag; - u_int32_t jumbolen = 0, plen; + u_int32_t plen; int optend; int ooff; u_int8_t proto; int terminal; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET6) r = r->skip[PF_SKIP_AF].ptr; #if 0 /* header chain! */ else if (r->proto && r->proto != h->ip6_nxt) r = r->skip[PF_SKIP_PROTO].ptr; #endif else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip6_src, AF_INET6, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip6_dst, AF_INET6, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else break; } if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); else { r->packets[dir == PF_OUT]++; r->bytes[dir == PF_OUT] += pd->tot_len; } /* Check for illegal packets */ if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) goto drop; + plen = ntohs(h->ip6_plen); + /* jumbo payload option not supported */ + if (plen == 0) + goto drop; + extoff = 0; off = sizeof(struct ip6_hdr); proto = h->ip6_nxt; terminal = 0; do { switch (proto) { case IPPROTO_FRAGMENT: goto fragment; break; case IPPROTO_AH: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; extoff = off; if (proto == IPPROTO_AH) off += (ext.ip6e_len + 2) * 4; else off += (ext.ip6e_len + 1) * 8; proto = ext.ip6e_nxt; break; case IPPROTO_HOPOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; extoff = off; optend = off + (ext.ip6e_len + 1) * 8; ooff = off + sizeof(ext); do { if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, sizeof(opt.ip6o_type), NULL, NULL, AF_INET6)) goto shortpkt; if (opt.ip6o_type == IP6OPT_PAD1) { ooff++; continue; } if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), NULL, NULL, AF_INET6)) goto shortpkt; if (ooff + sizeof(opt) + opt.ip6o_len > optend) goto drop; - switch (opt.ip6o_type) { - case IP6OPT_JUMBO: - if (h->ip6_plen != 0) - goto drop; - if (!pf_pull_hdr(m, ooff, &jumbo, - sizeof(jumbo), NULL, NULL, - AF_INET6)) - goto shortpkt; - memcpy(&jumbolen, jumbo.ip6oj_jumbo_len, - sizeof(jumbolen)); - jumbolen = ntohl(jumbolen); - if (jumbolen <= IPV6_MAXPACKET) - goto drop; - if (sizeof(struct ip6_hdr) + jumbolen != - m->m_pkthdr.len) - goto drop; - break; - default: - break; - } + if (opt.ip6o_type == IP6OPT_JUMBO) + goto drop; ooff += sizeof(opt) + opt.ip6o_len; } while (ooff < optend); off = optend; proto = ext.ip6e_nxt; break; default: terminal = 1; break; } } while (!terminal); - /* jumbo payload option must be present, or plen > 0 */ - if (ntohs(h->ip6_plen) == 0) - plen = jumbolen; - else - plen = ntohs(h->ip6_plen); - if (plen == 0) - goto drop; if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; pf_scrub_ip6(&m, r->min_ttl); return (PF_PASS); fragment: - /* Jumbo payload packets cannot be fragmented. */ - plen = ntohs(h->ip6_plen); - if (plen == 0 || jumbolen) - goto drop; if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) goto shortpkt; /* Offset now points to data portion. */ off += sizeof(frag); /* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. */ if (pf_reassemble6(m0, h, &frag, off, extoff, reason) != PF_PASS) return (PF_DROP); m = *m0; if (m == NULL) return (PF_DROP); pd->flags |= PFDESC_IP_REAS; return (PF_PASS); shortpkt: REASON_SET(reason, PFRES_SHORT); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); drop: REASON_SET(reason, PFRES_NORM); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif /* INET6 */ int pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, int off, void *h, struct pf_pdesc *pd) { struct pf_rule *r, *rm = NULL; struct tcphdr *th = pd->hdr.tcp; int rewrite = 0; u_short reason; u_int8_t flags; sa_family_t af = pd->af; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], th->th_sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], th->th_dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint)) r = TAILQ_NEXT(r, entries); else { rm = r; break; } } if (rm == NULL || rm->action == PF_NOSCRUB) return (PF_PASS); else { r->packets[dir == PF_OUT]++; r->bytes[dir == PF_OUT] += pd->tot_len; } if (rm->rule_flag & PFRULE_REASSEMBLE_TCP) pd->flags |= PFDESC_TCP_NORM; flags = th->th_flags; if (flags & TH_SYN) { /* Illegal packet */ if (flags & TH_RST) goto tcp_drop; if (flags & TH_FIN) goto tcp_drop; } else { /* Illegal packet */ if (!(flags & (TH_ACK|TH_RST))) goto tcp_drop; } if (!(flags & TH_ACK)) { /* These flags are only valid if ACK is set */ if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) goto tcp_drop; } /* Check for illegal header length */ if (th->th_off < (sizeof(struct tcphdr) >> 2)) goto tcp_drop; /* If flags changed, or reserved data set, then adjust */ if (flags != th->th_flags || th->th_x2 != 0) { u_int16_t ov, nv; ov = *(u_int16_t *)(&th->th_ack + 1); th->th_flags = flags; th->th_x2 = 0; nv = *(u_int16_t *)(&th->th_ack + 1); th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, ov, nv, 0); rewrite = 1; } /* Remove urgent pointer, if TH_URG is not set */ if (!(flags & TH_URG) && th->th_urp) { th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, th->th_urp, 0, 0); th->th_urp = 0; rewrite = 1; } /* Process options */ if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af)) rewrite = 1; /* copy back packet headers if we sanitized */ if (rewrite) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); tcp_drop: REASON_SET(&reason, PFRES_NORM); if (rm != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd, 1); return (PF_DROP); } int pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) { u_int32_t tsval, tsecr; u_int8_t hdr[60]; u_int8_t *opt; KASSERT((src->scrub == NULL), ("pf_normalize_tcp_init: src->scrub != NULL")); src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); if (src->scrub == NULL) return (1); switch (pd->af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); src->scrub->pfss_ttl = h->ip_ttl; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); src->scrub->pfss_ttl = h->ip6_hlim; break; } #endif /* INET6 */ } /* * All normalizations below are only begun if we see the start of * the connections. They must all set an enabled bit in pfss_flags */ if ((th->th_flags & TH_SYN) == 0) return (0); if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: if (opt[1] >= TCPOLEN_TIMESTAMP) { src->scrub->pfss_flags |= PFSS_TIMESTAMP; src->scrub->pfss_ts_mod = htonl(arc4random()); /* note PFSS_PAWS not set yet */ memcpy(&tsval, &opt[2], sizeof(u_int32_t)); memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); src->scrub->pfss_tsval0 = ntohl(tsval); src->scrub->pfss_tsval = ntohl(tsval); src->scrub->pfss_tsecr = ntohl(tsecr); getmicrouptime(&src->scrub->pfss_last); } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } } return (0); } void pf_normalize_tcp_cleanup(struct pf_state *state) { if (state->src.scrub) uma_zfree(V_pf_state_scrub_z, state->src.scrub); if (state->dst.scrub) uma_zfree(V_pf_state_scrub_z, state->dst.scrub); /* Someday... flush the TCP segment reassembly descriptors. */ } int pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, struct tcphdr *th, struct pf_state *state, struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback) { struct timeval uptime; u_int32_t tsval, tsecr; u_int tsval_from_last; u_int8_t hdr[60]; u_int8_t *opt; int copyback = 0; int got_ts = 0; KASSERT((src->scrub || dst->scrub), ("%s: src->scrub && dst->scrub!", __func__)); /* * Enforce the minimum TTL seen for this connection. Negate a common * technique to evade an intrusion detection system and confuse * firewall state code. */ switch (pd->af) { #ifdef INET case AF_INET: { if (src->scrub) { struct ip *h = mtod(m, struct ip *); if (h->ip_ttl > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip_ttl; h->ip_ttl = src->scrub->pfss_ttl; } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { if (src->scrub) { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (h->ip6_hlim > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip6_hlim; h->ip6_hlim = src->scrub->pfss_ttl; } break; } #endif /* INET6 */ } if (th->th_off > (sizeof(struct tcphdr) >> 2) && ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: /* Modulate the timestamps. Can be used for * NAT detection, OS uptime determination or * reboot detection. */ if (got_ts) { /* Huh? Multiple timestamps!? */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("multiple TS??")); pf_print_state(state); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } if (opt[1] >= TCPOLEN_TIMESTAMP) { memcpy(&tsval, &opt[2], sizeof(u_int32_t)); if (tsval && src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsval = ntohl(tsval); pf_change_proto_a(m, &opt[2], &th->th_sum, htonl(tsval + src->scrub->pfss_ts_mod), 0); copyback = 1; } /* Modulate TS reply iff valid (!0) */ memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); if (tsecr && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsecr = ntohl(tsecr) - dst->scrub->pfss_ts_mod; pf_change_proto_a(m, &opt[6], &th->th_sum, htonl(tsecr), 0); copyback = 1; } got_ts = 1; } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } if (copyback) { /* Copyback the options, caller copys back header */ *writeback = 1; m_copyback(m, off + sizeof(struct tcphdr), (th->th_off << 2) - sizeof(struct tcphdr), hdr + sizeof(struct tcphdr)); } } /* * Must invalidate PAWS checks on connections idle for too long. * The fastest allowed timestamp clock is 1ms. That turns out to * be about 24 days before it wraps. XXX Right now our lowerbound * TS echo check only works for the first 12 days of a connection * when the TS has exhausted half its 32bit space */ #define TS_MAX_IDLE (24*24*60*60) #define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */ getmicrouptime(&uptime); if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE || time_uptime - state->creation > TS_MAX_CONN)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("src idled out of PAWS\n")); pf_print_state(state); printf("\n"); } src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) && uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("dst idled out of PAWS\n")); pf_print_state(state); printf("\n"); } dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (got_ts && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Validate that the timestamps are "in-window". * RFC1323 describes TCP Timestamp options that allow * measurement of RTT (round trip time) and PAWS * (protection against wrapped sequence numbers). PAWS * gives us a set of rules for rejecting packets on * long fat pipes (packets that were somehow delayed * in transit longer than the time it took to send the * full TCP sequence space of 4Gb). We can use these * rules and infer a few others that will let us treat * the 32bit timestamp and the 32bit echoed timestamp * as sequence numbers to prevent a blind attacker from * inserting packets into a connection. * * RFC1323 tells us: * - The timestamp on this packet must be greater than * or equal to the last value echoed by the other * endpoint. The RFC says those will be discarded * since it is a dup that has already been acked. * This gives us a lowerbound on the timestamp. * timestamp >= other last echoed timestamp * - The timestamp will be less than or equal to * the last timestamp plus the time between the * last packet and now. The RFC defines the max * clock rate as 1ms. We will allow clocks to be * up to 10% fast and will allow a total difference * or 30 seconds due to a route change. And this * gives us an upperbound on the timestamp. * timestamp <= last timestamp + max ticks * We have to be careful here. Windows will send an * initial timestamp of zero and then initialize it * to a random value after the 3whs; presumably to * avoid a DoS by having to call an expensive RNG * during a SYN flood. Proof MS has at least one * good security geek. * * - The TCP timestamp option must also echo the other * endpoints timestamp. The timestamp echoed is the * one carried on the earliest unacknowledged segment * on the left edge of the sequence window. The RFC * states that the host will reject any echoed * timestamps that were larger than any ever sent. * This gives us an upperbound on the TS echo. * tescr <= largest_tsval * - The lowerbound on the TS echo is a little more * tricky to determine. The other endpoint's echoed * values will not decrease. But there may be * network conditions that re-order packets and * cause our view of them to decrease. For now the * only lowerbound we can safely determine is that * the TS echo will never be less than the original * TS. XXX There is probably a better lowerbound. * Remove TS_MAX_CONN with better lowerbound check. * tescr >= other original TS * * It is also important to note that the fastest * timestamp clock of 1ms will wrap its 32bit space in * 24 days. So we just disable TS checking after 24 * days of idle time. We actually must use a 12d * connection limit until we can come up with a better * lowerbound to the TS echo check. */ struct timeval delta_ts; int ts_fudge; /* * PFTM_TS_DIFF is how many seconds of leeway to allow * a host's timestamp. This can happen if the previous * packet got delayed in transit for much longer than * this packet. */ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0) ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF]; /* Calculate max ticks since the last timestamp */ #define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */ #define TS_MICROSECS 1000000 /* microseconds per second */ delta_ts = uptime; timevalsub(&delta_ts, &src->scrub->pfss_last); tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ; tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ); if ((src->state >= TCPS_ESTABLISHED && dst->state >= TCPS_ESTABLISHED) && (SEQ_LT(tsval, dst->scrub->pfss_tsecr) || SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) || (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) || SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) { /* Bad RFC1323 implementation or an insertion attack. * * - Solaris 2.6 and 2.7 are known to send another ACK * after the FIN,FIN|ACK,ACK closing that carries * an old timestamp. */ DPFPRINTF(("Timestamp failed %c%c%c%c\n", SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ', SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ? '1' : ' ', SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ', SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' ')); DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " "idle: %jus %lums\n", tsval, tsecr, tsval_from_last, (uintmax_t)delta_ts.tv_sec, delta_ts.tv_usec / 1000)); DPFPRINTF((" src->tsval: %u tsecr: %u\n", src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" "\n", dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); if (V_pf_status.debug >= PF_DEBUG_MISC) { pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } /* XXX I'd really like to require tsecr but it's optional */ } else if (!got_ts && (th->th_flags & TH_RST) == 0 && ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED) || pd->p_len > 0 || (th->th_flags & TH_SYN)) && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Didn't send a timestamp. Timestamps aren't really useful * when: * - connection opening or closing (often not even sent). * but we must not let an attacker to put a FIN on a * data packet to sneak it through our ESTABLISHED check. * - on a TCP reset. RFC suggests not even looking at TS. * - on an empty ACK. The TS will not be echoed so it will * probably not help keep the RTT calculation in sync and * there isn't as much danger when the sequence numbers * got wrapped. So some stacks don't include TS on empty * ACKs :-( * * To minimize the disruption to mostly RFC1323 conformant * stacks, we will only require timestamps on data packets. * * And what do ya know, we cannot require timestamps on data * packets. There appear to be devices that do legitimate * TCP connection hijacking. There are HTTP devices that allow * a 3whs (with timestamps) and then buffer the HTTP request. * If the intermediate device has the HTTP response cache, it * will spoof the response but not bother timestamping its * packets. So we can look for the presence of a timestamp in * the first data packet and if there, require it in all future * packets. */ if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { /* * Hey! Someone tried to sneak a packet in. Or the * stack changed its RFC1323 behavior?!?! */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("Did not receive expected RFC1323 " "timestamp\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } } /* * We will note if a host sends his data packets with or without * timestamps. And require all data packets to contain a timestamp * if the first does. PAWS implicitly requires that all data packets be * timestamped. But I think there are middle-man devices that hijack * TCP streams immediately after the 3whs and don't timestamp their * packets (seen in a WWW accelerator or cache). */ if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { if (got_ts) src->scrub->pfss_flags |= PFSS_DATA_TS; else { src->scrub->pfss_flags |= PFSS_DATA_NOTS; if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { /* Don't warn if other host rejected RFC1323 */ DPFPRINTF(("Broken RFC1323 stack did not " "timestamp data packet. Disabled PAWS " "security.\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } } } /* * Update PAWS values */ if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { getmicrouptime(&src->scrub->pfss_last); if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsval = tsval; if (tsecr) { if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsecr = tsecr; if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && (SEQ_LT(tsval, src->scrub->pfss_tsval0) || src->scrub->pfss_tsval0 == 0)) { /* tsval0 MUST be the lowest timestamp */ src->scrub->pfss_tsval0 = tsval; } /* Only fully initialized after a TS gets echoed */ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_flags |= PFSS_PAWS; } } /* I have a dream.... TCP segment reassembly.... */ return (0); } static int pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, int off, sa_family_t af) { u_int16_t *mss; int thoff; int opt, cnt, optlen = 0; int rewrite = 0; u_char opts[TCP_MAXOLEN]; u_char *optp = opts; thoff = th->th_off << 2; cnt = thoff - sizeof(struct tcphdr); if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt, NULL, NULL, af)) return (rewrite); for (; cnt > 0; cnt -= optlen, optp += optlen) { opt = optp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = optp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: mss = (u_int16_t *)(optp + 2); if ((ntohs(*mss)) > r->max_mss) { th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, *mss, htons(r->max_mss), 0); *mss = htons(r->max_mss); rewrite = 1; } break; default: break; } } if (rewrite) m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts); return (rewrite); } #ifdef INET static void pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos) { struct mbuf *m = *m0; struct ip *h = mtod(m, struct ip *); /* Clear IP_DF if no-df was requested */ if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* Enforce a minimum ttl, may cause endless packet loops */ if (min_ttl && h->ip_ttl < min_ttl) { u_int16_t ip_ttl = h->ip_ttl; h->ip_ttl = min_ttl; h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); } /* Enforce tos */ if (flags & PFRULE_SET_TOS) { u_int16_t ov, nv; ov = *(u_int16_t *)h; h->ip_tos = tos | (h->ip_tos & IPTOS_ECN_MASK); nv = *(u_int16_t *)h; h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0); } /* random-id, but not for fragments */ if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) { uint16_t ip_id = h->ip_id; ip_fillid(h); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0); } } #endif /* INET */ #ifdef INET6 static void pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl) { struct mbuf *m = *m0; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); /* Enforce a minimum ttl, may cause endless packet loops */ if (min_ttl && h->ip6_hlim < min_ttl) h->ip6_hlim = min_ttl; } #endif Index: projects/fuse2/sys/riscv/riscv/copyinout.S =================================================================== --- projects/fuse2/sys/riscv/riscv/copyinout.S (revision 350434) +++ projects/fuse2/sys/riscv/riscv/copyinout.S (revision 350435) @@ -1,183 +1,184 @@ /*- * Copyright (c) 2015-2018 Ruslan Bukin * Copyright (c) 2019 Mitchell Horne * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include "assym.inc" /* * Fault handler for the copy{in,out} functions below. */ ENTRY(copyio_fault) SET_FAULT_HANDLER(x0, a1) /* Clear the handler */ EXIT_USER_ACCESS(a1) copyio_fault_nopcb: li a0, EFAULT ret END(copyio_fault) /* * copycommon - common copy routine * * a0 - Source address * a1 - Destination address * a2 - Size of copy */ .macro copycommon la a6, copyio_fault /* Get the handler address */ SET_FAULT_HANDLER(a6, a7) /* Set the handler */ ENTER_USER_ACCESS(a7) li t2, XLEN_BYTES - blt a2, t2, 3f /* Byte-copy if len < XLEN_BYTES */ + blt a2, t2, 4f /* Byte-copy if len < XLEN_BYTES */ /* * Compare lower bits of src and dest. * If they are aligned with each other, we can do word copy. */ andi t0, a0, (XLEN_BYTES-1) /* Low bits of src */ andi t1, a1, (XLEN_BYTES-1) /* Low bits of dest */ - bne t0, t1, 3f /* Misaligned. Go to byte copy */ + bne t0, t1, 4f /* Misaligned. Go to byte copy */ beqz t0, 2f /* Already word-aligned, skip ahead */ /* Byte copy until the first word-aligned address */ 1: lb a4, 0(a0) /* Load byte from src */ addi a0, a0, 1 sb a4, 0(a1) /* Store byte in dest */ addi a1, a1, 1 addi a2, a2, -1 /* len-- */ andi t0, a0, (XLEN_BYTES-1) bnez t0, 1b + j 3f /* Copy words */ 2: ld a4, 0(a0) /* Load word from src */ addi a0, a0, XLEN_BYTES sd a4, 0(a1) /* Store word in dest */ addi a1, a1, XLEN_BYTES addi a2, a2, -XLEN_BYTES /* len -= XLEN_BYTES */ - bgeu a2, t2, 2b /* Again if len >= XLEN_BYTES */ +3: bgeu a2, t2, 2b /* Again if len >= XLEN_BYTES */ /* Check if we're finished */ - beqz a2, 4f + beqz a2, 5f /* Copy any remaining bytes */ -3: lb a4, 0(a0) /* Load byte from src */ +4: lb a4, 0(a0) /* Load byte from src */ addi a0, a0, 1 sb a4, 0(a1) /* Store byte in dest */ addi a1, a1, 1 addi a2, a2, -1 /* len-- */ - bnez a2, 3b + bnez a2, 4b -4: EXIT_USER_ACCESS(a7) +5: EXIT_USER_ACCESS(a7) SET_FAULT_HANDLER(x0, a7) /* Clear the handler */ .endm /* * Copies from a kernel to user address * * int copyout(const void *kaddr, void *udaddr, size_t len) */ ENTRY(copyout) beqz a2, copyout_end /* If len == 0 then skip loop */ add a3, a1, a2 li a4, VM_MAXUSER_ADDRESS bgt a3, a4, copyio_fault_nopcb copycommon copyout_end: li a0, 0 /* return 0 */ ret END(copyout) /* * Copies from a user to kernel address * * int copyin(const void *uaddr, void *kaddr, size_t len) */ ENTRY(copyin) beqz a2, copyin_end /* If len == 0 then skip loop */ add a3, a0, a2 li a4, VM_MAXUSER_ADDRESS bgt a3, a4, copyio_fault_nopcb copycommon copyin_end: li a0, 0 /* return 0 */ ret END(copyin) /* * Copies a string from a user to kernel address * * int copyinstr(const void *udaddr, void *kaddr, size_t len, size_t *done) */ ENTRY(copyinstr) mv a5, x0 /* count = 0 */ beqz a2, 3f /* If len == 0 then skip loop */ la a6, copyio_fault /* Get the handler address */ SET_FAULT_HANDLER(a6, a7) /* Set the handler */ ENTER_USER_ACCESS(a7) li a7, VM_MAXUSER_ADDRESS 1: bgt a0, a7, copyio_fault lb a4, 0(a0) /* Load from uaddr */ addi a0, a0, 1 sb a4, 0(a1) /* Store in kaddr */ addi a1, a1, 1 beqz a4, 2f addi a2, a2, -1 /* len-- */ addi a5, a5, 1 /* count++ */ bnez a2, 1b 2: EXIT_USER_ACCESS(a7) SET_FAULT_HANDLER(x0, a7) /* Clear the handler */ 3: beqz a3, 4f /* Check if done != NULL */ addi a5, a5, 1 /* count++ */ sd a5, 0(a3) /* done = count */ 4: mv a0, x0 /* return 0 */ beqz a4, 5f li a0, ENAMETOOLONG 5: ret END(copyinstr) Index: projects/fuse2/sys/rpc/svc_vc.c =================================================================== --- projects/fuse2/sys/rpc/svc_vc.c (revision 350434) +++ projects/fuse2/sys/rpc/svc_vc.c (revision 350435) @@ -1,994 +1,995 @@ /* $NetBSD: svc_vc.c,v 1.7 2000/08/03 00:01:53 fvdl Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009, Sun Microsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Sun Microsystems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char *sccsid2 = "@(#)svc_tcp.c 1.21 87/08/11 Copyr 1984 Sun Micro"; static char *sccsid = "@(#)svc_tcp.c 2.2 88/08/01 4.0 RPCSRC"; #endif #include __FBSDID("$FreeBSD$"); /* * svc_vc.c, Server side for Connection Oriented based RPC. * * Actually implements two flavors of transporter - * a tcp rendezvouser (a listner and connection establisher) * and a record/tcp stream. */ #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static bool_t svc_vc_rendezvous_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *); static void svc_vc_rendezvous_destroy(SVCXPRT *); static bool_t svc_vc_null(void); static void svc_vc_destroy(SVCXPRT *); static enum xprt_stat svc_vc_stat(SVCXPRT *); static bool_t svc_vc_ack(SVCXPRT *, uint32_t *); static bool_t svc_vc_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *seq); static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in); static bool_t svc_vc_rendezvous_control (SVCXPRT *xprt, const u_int rq, void *in); static void svc_vc_backchannel_destroy(SVCXPRT *); static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *); static bool_t svc_vc_backchannel_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_backchannel_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *); static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in); static SVCXPRT *svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr); static int svc_vc_accept(struct socket *head, struct socket **sop); static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag); static int svc_vc_rendezvous_soupcall(struct socket *, void *, int); static struct xp_ops svc_vc_rendezvous_ops = { .xp_recv = svc_vc_rendezvous_recv, .xp_stat = svc_vc_rendezvous_stat, .xp_reply = (bool_t (*)(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *))svc_vc_null, .xp_destroy = svc_vc_rendezvous_destroy, .xp_control = svc_vc_rendezvous_control }; static struct xp_ops svc_vc_ops = { .xp_recv = svc_vc_recv, .xp_stat = svc_vc_stat, .xp_ack = svc_vc_ack, .xp_reply = svc_vc_reply, .xp_destroy = svc_vc_destroy, .xp_control = svc_vc_control }; static struct xp_ops svc_vc_backchannel_ops = { .xp_recv = svc_vc_backchannel_recv, .xp_stat = svc_vc_backchannel_stat, .xp_reply = svc_vc_backchannel_reply, .xp_destroy = svc_vc_backchannel_destroy, .xp_control = svc_vc_backchannel_control }; /* * Usage: * xprt = svc_vc_create(sock, send_buf_size, recv_buf_size); * * Creates, registers, and returns a (rpc) tcp based transporter. * Once *xprt is initialized, it is registered as a transporter * see (svc.h, xprt_register). This routine returns * a NULL if a problem occurred. * * The filedescriptor passed in is expected to refer to a bound, but * not yet connected socket. * * Since streams do buffered io similar to stdio, the caller can specify * how big the send and receive buffers are via the second and third parms; * 0 => use the system default. */ SVCXPRT * svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize, size_t recvsize) { SVCXPRT *xprt; struct sockaddr* sa; int error; SOCK_LOCK(so); if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED)) { SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); CURVNET_RESTORE(); if (error) return (NULL); xprt = svc_vc_create_conn(pool, so, sa); free(sa, M_SONAME); return (xprt); } SOCK_UNLOCK(so); xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = NULL; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_rendezvous_ops; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) { goto cleanup_svc_vc_create; } memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); solisten(so, -1, curthread); SOLISTEN_LOCK(so); xprt->xp_upcallset = 1; solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt); SOLISTEN_UNLOCK(so); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); return (NULL); } /* * Create a new transport for a socket optained via soaccept(). */ SVCXPRT * svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) { SVCXPRT *xprt; struct cf_conn *cd; struct sockaddr* sa = NULL; struct sockopt opt; int one = 1; int error; bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_KEEPALIVE; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } if (so->so_proto->pr_protocol == IPPROTO_TCP) { bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } } cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_ops; /* * See http://www.connectathon.org/talks96/nfstcp.pdf - client * has a 5 minute timer, server has a 6 minute timer. */ xprt->xp_idletimeout = 6 * 60; memcpy(&xprt->xp_rtaddr, raddr, raddr->sa_len); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) goto cleanup_svc_vc_create; memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); SOCKBUF_LOCK(&so->so_rcv); xprt->xp_upcallset = 1; soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt); SOCKBUF_UNLOCK(&so->so_rcv); /* * Throw the transport into the active list in case it already * has some data buffered. */ sx_xlock(&xprt->xp_lock); xprt_active(xprt); sx_xunlock(&xprt->xp_lock); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); mem_free(cd, sizeof(*cd)); return (NULL); } /* * Create a new transport for a backchannel on a clnt_vc socket. */ SVCXPRT * svc_vc_create_backchannel(SVCPOOL *pool) { SVCXPRT *xprt = NULL; struct cf_conn *cd = NULL; cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = NULL; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_backchannel_ops; return (xprt); } /* * This does all of the accept except the final call to soaccept. The * caller will call soaccept after dropping its locks (soaccept may * call malloc). */ int svc_vc_accept(struct socket *head, struct socket **sop) { struct socket *so; int error = 0; short nbio; /* XXXGL: shouldn't that be an assertion? */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(curthread->td_ucred, head); if (error != 0) goto done; #endif /* * XXXGL: we want non-blocking semantics. The socket could be a * socket created by kernel as well as socket shared with userland, * so we can't be sure about presense of SS_NBIO. We also shall not * toggle it on the socket, since that may surprise userland. So we * set SS_NBIO only temporarily. */ SOLISTEN_LOCK(head); nbio = head->so_state & SS_NBIO; head->so_state |= SS_NBIO; error = solisten_dequeue(head, &so, 0); head->so_state &= (nbio & ~SS_NBIO); if (error) goto done; so->so_state |= nbio; *sop = so; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); done: return (error); } /*ARGSUSED*/ static bool_t svc_vc_rendezvous_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct socket *so = NULL; struct sockaddr *sa = NULL; int error; SVCXPRT *new_xprt; /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to accept a * connection from the socket and turn it into a new * transport. If the accept fails, we have drained all pending * connections so we call xprt_inactive(). */ sx_xlock(&xprt->xp_lock); error = svc_vc_accept(xprt->xp_socket, &so); if (error == EWOULDBLOCK) { /* * We must re-test for new connections after taking * the lock to protect us in the case where a new * connection arrives after our call to accept fails * with EWOULDBLOCK. */ SOLISTEN_LOCK(xprt->xp_socket); if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp)) xprt_inactive_self(xprt); SOLISTEN_UNLOCK(xprt->xp_socket); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } SOLISTEN_UNLOCK(xprt->xp_socket); xprt_inactive_self(xprt); sx_xunlock(&xprt->xp_lock); return (FALSE); } sx_xunlock(&xprt->xp_lock); sa = NULL; error = soaccept(so, &sa); if (error) { /* * XXX not sure if I need to call sofree or soclose here. */ if (sa) free(sa, M_SONAME); return (FALSE); } /* * svc_vc_create_conn will call xprt_register - we don't need * to do anything with the new connection except derefence it. */ new_xprt = svc_vc_create_conn(xprt->xp_pool, so, sa); if (!new_xprt) { soclose(so); } else { SVC_RELEASE(new_xprt); } free(sa, M_SONAME); return (FALSE); /* there is never an rpc msg to be processed */ } /*ARGSUSED*/ static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *xprt) { return (XPRT_IDLE); } static void svc_vc_destroy_common(SVCXPRT *xprt) { if (xprt->xp_socket) (void)soclose(xprt->xp_socket); if (xprt->xp_netid) (void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1); svc_xprt_free(xprt); } static void svc_vc_rendezvous_destroy(SVCXPRT *xprt) { SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; solisten_upcall_set(xprt->xp_socket, NULL, NULL); } SOLISTEN_UNLOCK(xprt->xp_socket); svc_vc_destroy_common(xprt); } static void svc_vc_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); svc_vc_destroy_common(xprt); if (cd->mreq) m_freem(cd->mreq); if (cd->mpending) m_freem(cd->mpending); mem_free(cd, sizeof(*cd)); } static void svc_vc_backchannel_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; struct mbuf *m, *m2; svc_xprt_free(xprt); m = cd->mreq; while (m != NULL) { m2 = m; m = m->m_nextpkt; m_freem(m2); } mem_free(cd, sizeof(*cd)); } /*ARGSUSED*/ static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_rendezvous_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static enum xprt_stat svc_vc_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->strm_stat == XPRT_DIED) return (XPRT_DIED); if (cd->mreq != NULL && cd->resid == 0 && cd->eor) return (XPRT_MOREREQS); if (soreadable(xprt->xp_socket)) return (XPRT_MOREREQS); return (XPRT_IDLE); } static bool_t svc_vc_ack(SVCXPRT *xprt, uint32_t *ack) { *ack = atomic_load_acq_32(&xprt->xp_snt_cnt); *ack -= sbused(&xprt->xp_socket->so_snd); return (TRUE); } static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->mreq != NULL) return (XPRT_MOREREQS); return (XPRT_IDLE); } /* * If we have an mbuf chain in cd->mpending, try to parse a record from it, * leaving the result in cd->mreq. If we don't have a complete record, leave * the partial result in cd->mreq and try to read more from the socket. */ static int svc_vc_process_pending(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct socket *so = xprt->xp_socket; struct mbuf *m; /* * If cd->resid is non-zero, we have part of the * record already, otherwise we are expecting a record * marker. */ if (!cd->resid && cd->mpending) { /* * See if there is enough data buffered to * make up a record marker. Make sure we can * handle the case where the record marker is * split across more than one mbuf. */ size_t n = 0; uint32_t header; m = cd->mpending; while (n < sizeof(uint32_t) && m) { n += m->m_len; m = m->m_next; } if (n < sizeof(uint32_t)) { so->so_rcv.sb_lowat = sizeof(uint32_t) - n; return (FALSE); } m_copydata(cd->mpending, 0, sizeof(header), (char *)&header); header = ntohl(header); cd->eor = (header & 0x80000000) != 0; cd->resid = header & 0x7fffffff; m_adj(cd->mpending, sizeof(uint32_t)); } /* * Start pulling off mbufs from cd->mpending * until we either have a complete record or * we run out of data. We use m_split to pull * data - it will pull as much as possible and * split the last mbuf if necessary. */ while (cd->mpending && cd->resid) { m = cd->mpending; if (cd->mpending->m_next || cd->mpending->m_len > cd->resid) cd->mpending = m_split(cd->mpending, cd->resid, M_WAITOK); else cd->mpending = NULL; if (cd->mreq) m_last(cd->mreq)->m_next = m; else cd->mreq = m; while (m) { cd->resid -= m->m_len; m = m->m_next; } } /* * Block receive upcalls if we have more data pending, * otherwise report our need. */ if (cd->mpending) so->so_rcv.sb_lowat = INT_MAX; else so->so_rcv.sb_lowat = imax(1, imin(cd->resid, so->so_rcv.sb_hiwat / 2)); return (TRUE); } static bool_t svc_vc_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct uio uio; struct mbuf *m; struct socket* so = xprt->xp_socket; XDR xdrs; int error, rcvflag; uint32_t xid_plus_direction[2]; /* * Serialise access to the socket and our own record parsing * state. */ sx_xlock(&xprt->xp_lock); for (;;) { /* If we have no request ready, check pending queue. */ while (cd->mpending && (cd->mreq == NULL || cd->resid != 0 || !cd->eor)) { if (!svc_vc_process_pending(xprt)) break; } /* Process and return complete request in cd->mreq. */ if (cd->mreq != NULL && cd->resid == 0 && cd->eor) { /* * Now, check for a backchannel reply. * The XID is in the first uint32_t of the reply * and the message direction is the second one. */ if ((cd->mreq->m_len >= sizeof(xid_plus_direction) || m_length(cd->mreq, NULL) >= sizeof(xid_plus_direction)) && xprt->xp_p2 != NULL) { m_copydata(cd->mreq, 0, sizeof(xid_plus_direction), (char *)xid_plus_direction); xid_plus_direction[0] = ntohl(xid_plus_direction[0]); xid_plus_direction[1] = ntohl(xid_plus_direction[1]); /* Check message direction. */ if (xid_plus_direction[1] == REPLY) { clnt_bck_svccall(xprt->xp_p2, cd->mreq, xid_plus_direction[0]); cd->mreq = NULL; continue; } } xdrmbuf_create(&xdrs, cd->mreq, XDR_DECODE); cd->mreq = NULL; /* Check for next request in a pending queue. */ svc_vc_process_pending(xprt); if (cd->mreq == NULL || cd->resid != 0) { SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); } sx_xunlock(&xprt->xp_lock); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to * read as much as possible from the socket and put * the result in cd->mpending. If the read fails, * we have drained both cd->mpending and the socket so * we can call xprt_inactive(). */ uio.uio_resid = 1000000000; uio.uio_td = curthread; m = NULL; rcvflag = MSG_DONTWAIT; error = soreceive(so, NULL, &uio, &m, NULL, &rcvflag); if (error == EWOULDBLOCK) { /* * We must re-test for readability after * taking the lock to protect us in the case * where a new packet arrives on the socket * after our call to soreceive fails with * EWOULDBLOCK. */ SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { SOCKBUF_LOCK(&so->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(so, SO_RCV); } SOCKBUF_UNLOCK(&so->so_rcv); xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (!m) { /* * EOF - the other end has closed the socket. */ xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (cd->mpending) m_last(cd->mpending)->m_next = m; else cd->mpending = m; } } static bool_t svc_vc_backchannel_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct ct_data *ct; struct mbuf *m; XDR xdrs; sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct == NULL) { sx_xunlock(&xprt->xp_lock); return (FALSE); } mtx_lock(&ct->ct_lock); m = cd->mreq; if (m == NULL) { xprt_inactive_self(xprt); mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); return (FALSE); } cd->mreq = m->m_nextpkt; mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); xdrmbuf_create(&xdrs, m, XDR_DECODE); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } static bool_t svc_vc_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error, len; /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); len = mrep->m_pkthdr.len; *mtod(mrep, uint32_t *) = htonl(0x80000000 | (len - sizeof(uint32_t))); atomic_add_32(&xprt->xp_snd_cnt, len); error = sosend(xprt->xp_socket, NULL, NULL, mrep, NULL, 0, curthread); if (!error) { atomic_add_rel_32(&xprt->xp_snt_cnt, len); if (seq) *seq = xprt->xp_snd_cnt; stat = TRUE; } else atomic_subtract_32(&xprt->xp_snd_cnt, len); } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_backchannel_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { struct ct_data *ct; XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error; /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); *mtod(mrep, uint32_t *) = htonl(0x80000000 | (mrep->m_pkthdr.len - sizeof(uint32_t))); sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct != NULL) error = sosend(ct->ct_socket, NULL, NULL, mrep, NULL, 0, curthread); else error = EPIPE; sx_xunlock(&xprt->xp_lock); if (!error) { stat = TRUE; } } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_null() { return (FALSE); } static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (soreadable(xprt->xp_socket)) xprt_active(xprt); return (SU_OK); } static int svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (!TAILQ_EMPTY(&head->sol_comp)) xprt_active(xprt); return (SU_OK); } #if 0 /* * Get the effective UID of the sending process. Used by rpcbind, keyserv * and rpc.yppasswdd on AF_LOCAL. */ int __rpc_get_local_uid(SVCXPRT *transp, uid_t *uid) { int sock, ret; gid_t egid; uid_t euid; struct sockaddr *sa; sock = transp->xp_fd; sa = (struct sockaddr *)transp->xp_rtaddr; if (sa->sa_family == AF_LOCAL) { ret = getpeereid(sock, &euid, &egid); if (ret == 0) *uid = euid; return (ret); } else return (-1); } #endif Index: projects/fuse2/sys/sys/ata.h =================================================================== --- projects/fuse2/sys/sys/ata.h (revision 350434) +++ projects/fuse2/sys/sys/ata.h (revision 350435) @@ -1,1056 +1,1059 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_ATA_H_ #define _SYS_ATA_H_ #include /* ATA/ATAPI device parameters */ struct ata_params { /*000*/ u_int16_t config; /* configuration info */ #define ATA_PROTO_MASK 0x8003 #define ATA_PROTO_ATAPI 0x8000 #define ATA_PROTO_ATAPI_12 0x8000 #define ATA_PROTO_ATAPI_16 0x8001 #define ATA_PROTO_CFA 0x848a #define ATA_ATAPI_TYPE_MASK 0x1f00 #define ATA_ATAPI_TYPE_DIRECT 0x0000 /* disk/floppy */ #define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */ #define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */ #define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */ #define ATA_DRQ_MASK 0x0060 #define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */ #define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */ #define ATA_DRQ_FAST 0x0040 /* accel 50 us delay */ #define ATA_RESP_INCOMPLETE 0x0004 /*001*/ u_int16_t cylinders; /* # of cylinders */ /*002*/ u_int16_t specconf; /* specific configuration */ /*003*/ u_int16_t heads; /* # heads */ u_int16_t obsolete4; u_int16_t obsolete5; /*006*/ u_int16_t sectors; /* # sectors/track */ /*007*/ u_int16_t vendor7[3]; /*010*/ u_int8_t serial[20]; /* serial number */ /*020*/ u_int16_t retired20; u_int16_t retired21; u_int16_t obsolete22; /*023*/ u_int8_t revision[8]; /* firmware revision */ /*027*/ u_int8_t model[40]; /* model name */ /*047*/ u_int16_t sectors_intr; /* sectors per interrupt */ /*048*/ u_int16_t tcg; /* Trusted Computing Group */ #define ATA_SUPPORT_TCG 0x0001 /*049*/ u_int16_t capabilities1; #define ATA_SUPPORT_DMA 0x0100 #define ATA_SUPPORT_LBA 0x0200 #define ATA_SUPPORT_IORDYDIS 0x0400 #define ATA_SUPPORT_IORDY 0x0800 #define ATA_SUPPORT_OVERLAP 0x4000 /*050*/ u_int16_t capabilities2; /*051*/ u_int16_t retired_piomode; /* PIO modes 0-2 */ #define ATA_RETIRED_PIO_MASK 0x0300 /*052*/ u_int16_t retired_dmamode; /* DMA modes */ #define ATA_RETIRED_DMA_MASK 0x0003 /*053*/ u_int16_t atavalid; /* fields valid */ #define ATA_FLAG_54_58 0x0001 /* words 54-58 valid */ #define ATA_FLAG_64_70 0x0002 /* words 64-70 valid */ #define ATA_FLAG_88 0x0004 /* word 88 valid */ /*054*/ u_int16_t current_cylinders; /*055*/ u_int16_t current_heads; /*056*/ u_int16_t current_sectors; /*057*/ u_int16_t current_size_1; /*058*/ u_int16_t current_size_2; /*059*/ u_int16_t multi; #define ATA_SUPPORT_BLOCK_ERASE_EXT 0x8000 #define ATA_SUPPORT_OVERWRITE_EXT 0x4000 #define ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT 0x2000 #define ATA_SUPPORT_SANITIZE 0x1000 #define ATA_SUPPORT_SANITIZE_ALLOWED 0x0800 #define ATA_SUPPORT_ANTIFREEZE_LOCK_EXT 0x0400 #define ATA_MULTI_VALID 0x0100 /*060*/ u_int16_t lba_size_1; u_int16_t lba_size_2; u_int16_t obsolete62; /*063*/ u_int16_t mwdmamodes; /* multiword DMA modes */ /*064*/ u_int16_t apiomodes; /* advanced PIO modes */ /*065*/ u_int16_t mwdmamin; /* min. M/W DMA time/word ns */ /*066*/ u_int16_t mwdmarec; /* rec. M/W DMA time ns */ /*067*/ u_int16_t pioblind; /* min. PIO cycle w/o flow */ /*068*/ u_int16_t pioiordy; /* min. PIO cycle IORDY flow */ /*069*/ u_int16_t support3; #define ATA_SUPPORT_RZAT 0x0020 #define ATA_SUPPORT_DRAT 0x4000 #define ATA_ENCRYPTS_ALL_USER_DATA 0x0010 /* Self-encrypting drive */ #define ATA_SUPPORT_ZONE_MASK 0x0003 #define ATA_SUPPORT_ZONE_NR 0x0000 #define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001 #define ATA_SUPPORT_ZONE_DEV_MANAGED 0x0002 u_int16_t reserved70; /*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ /*072*/ u_int16_t rlsservice; /* rel time (us) for service */ u_int16_t reserved73; u_int16_t reserved74; /*075*/ u_int16_t queue; #define ATA_QUEUE_LEN(x) ((x) & 0x001f) /*76*/ u_int16_t satacapabilities; #define ATA_SATA_GEN1 0x0002 #define ATA_SATA_GEN2 0x0004 #define ATA_SATA_GEN3 0x0008 #define ATA_SUPPORT_NCQ 0x0100 #define ATA_SUPPORT_IFPWRMNGTRCV 0x0200 #define ATA_SUPPORT_PHYEVENTCNT 0x0400 #define ATA_SUPPORT_NCQ_UNLOAD 0x0800 #define ATA_SUPPORT_NCQ_PRIO 0x1000 #define ATA_SUPPORT_HAPST 0x2000 #define ATA_SUPPORT_DAPST 0x4000 #define ATA_SUPPORT_READLOGDMAEXT 0x8000 /*77*/ u_int16_t satacapabilities2; #define ATA_SATA_CURR_GEN_MASK 0x0006 #define ATA_SUPPORT_NCQ_STREAM 0x0010 -#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 +#define ATA_SUPPORT_NCQ_NON_DATA 0x0020 +#define ATA_SUPPORT_NCQ_QMANAGEMENT ATA_SUPPORT_NCQ_NON_DATA #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 /*78*/ u_int16_t satasupport; #define ATA_SUPPORT_NONZERO 0x0002 #define ATA_SUPPORT_AUTOACTIVATE 0x0004 #define ATA_SUPPORT_IFPWRMNGT 0x0008 #define ATA_SUPPORT_INORDERDATA 0x0010 #define ATA_SUPPORT_ASYNCNOTIF 0x0020 #define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 +#define ATA_SUPPORT_NCQ_AUTOSENSE 0x0080 /*79*/ u_int16_t sataenabled; #define ATA_ENABLED_DAPST 0x0080 /*080*/ u_int16_t version_major; /*081*/ u_int16_t version_minor; struct { /*082/085*/ u_int16_t command1; #define ATA_SUPPORT_SMART 0x0001 #define ATA_SUPPORT_SECURITY 0x0002 #define ATA_SUPPORT_REMOVABLE 0x0004 #define ATA_SUPPORT_POWERMGT 0x0008 #define ATA_SUPPORT_PACKET 0x0010 #define ATA_SUPPORT_WRITECACHE 0x0020 #define ATA_SUPPORT_LOOKAHEAD 0x0040 #define ATA_SUPPORT_RELEASEIRQ 0x0080 #define ATA_SUPPORT_SERVICEIRQ 0x0100 #define ATA_SUPPORT_RESET 0x0200 #define ATA_SUPPORT_PROTECTED 0x0400 #define ATA_SUPPORT_WRITEBUFFER 0x1000 #define ATA_SUPPORT_READBUFFER 0x2000 #define ATA_SUPPORT_NOP 0x4000 /*083/086*/ u_int16_t command2; #define ATA_SUPPORT_MICROCODE 0x0001 #define ATA_SUPPORT_QUEUED 0x0002 #define ATA_SUPPORT_CFA 0x0004 #define ATA_SUPPORT_APM 0x0008 #define ATA_SUPPORT_NOTIFY 0x0010 #define ATA_SUPPORT_STANDBY 0x0020 #define ATA_SUPPORT_SPINUP 0x0040 #define ATA_SUPPORT_MAXSECURITY 0x0100 #define ATA_SUPPORT_AUTOACOUSTIC 0x0200 #define ATA_SUPPORT_ADDRESS48 0x0400 #define ATA_SUPPORT_OVERLAY 0x0800 #define ATA_SUPPORT_FLUSHCACHE 0x1000 #define ATA_SUPPORT_FLUSHCACHE48 0x2000 /*084/087*/ u_int16_t extension; #define ATA_SUPPORT_SMARTLOG 0x0001 #define ATA_SUPPORT_SMARTTEST 0x0002 #define ATA_SUPPORT_MEDIASN 0x0004 #define ATA_SUPPORT_MEDIAPASS 0x0008 #define ATA_SUPPORT_STREAMING 0x0010 #define ATA_SUPPORT_GENLOG 0x0020 #define ATA_SUPPORT_WRITEDMAFUAEXT 0x0040 #define ATA_SUPPORT_WRITEDMAQFUAEXT 0x0080 #define ATA_SUPPORT_64BITWWN 0x0100 #define ATA_SUPPORT_UNLOAD 0x2000 } __packed support, enabled; /*088*/ u_int16_t udmamodes; /* UltraDMA modes */ /*089*/ u_int16_t erase_time; /* time req'd in 2min units */ /*090*/ u_int16_t enhanced_erase_time; /* time req'd in 2min units */ /*091*/ u_int16_t apm_value; /*092*/ u_int16_t master_passwd_revision; /* password revision code */ /*093*/ u_int16_t hwres; #define ATA_CABLE_ID 0x2000 /*094*/ u_int16_t acoustic; #define ATA_ACOUSTIC_CURRENT(x) ((x) & 0x00ff) #define ATA_ACOUSTIC_VENDOR(x) (((x) & 0xff00) >> 8) /*095*/ u_int16_t stream_min_req_size; /*096*/ u_int16_t stream_transfer_time; /*097*/ u_int16_t stream_access_latency; /*098*/ u_int32_t stream_granularity; /*100*/ u_int16_t lba_size48_1; u_int16_t lba_size48_2; u_int16_t lba_size48_3; u_int16_t lba_size48_4; u_int16_t reserved104; /*105*/ u_int16_t max_dsm_blocks; /*106*/ u_int16_t pss; #define ATA_PSS_LSPPS 0x000F #define ATA_PSS_LSSABOVE512 0x1000 #define ATA_PSS_MULTLS 0x2000 #define ATA_PSS_VALID_MASK 0xC000 #define ATA_PSS_VALID_VALUE 0x4000 /*107*/ u_int16_t isd; /*108*/ u_int16_t wwn[4]; u_int16_t reserved112[5]; /*117*/ u_int16_t lss_1; /*118*/ u_int16_t lss_2; /*119*/ u_int16_t support2; #define ATA_SUPPORT_WRITEREADVERIFY 0x0002 #define ATA_SUPPORT_WRITEUNCORREXT 0x0004 #define ATA_SUPPORT_RWLOGDMAEXT 0x0008 #define ATA_SUPPORT_MICROCODE3 0x0010 #define ATA_SUPPORT_FREEFALL 0x0020 #define ATA_SUPPORT_SENSE_REPORT 0x0040 #define ATA_SUPPORT_EPC 0x0080 #define ATA_SUPPORT_AMAX_ADDR 0x0100 #define ATA_SUPPORT_DSN 0x0200 /*120*/ u_int16_t enabled2; #define ATA_ENABLED_WRITEREADVERIFY 0x0002 #define ATA_ENABLED_WRITEUNCORREXT 0x0004 #define ATA_ENABLED_FREEFALL 0x0020 #define ATA_ENABLED_SENSE_REPORT 0x0040 #define ATA_ENABLED_EPC 0x0080 #define ATA_ENABLED_DSN 0x0200 u_int16_t reserved121[6]; /*127*/ u_int16_t removable_status; /*128*/ u_int16_t security_status; #define ATA_SECURITY_LEVEL 0x0100 /* 0: high, 1: maximum */ #define ATA_SECURITY_ENH_SUPP 0x0020 /* enhanced erase supported */ #define ATA_SECURITY_COUNT_EXP 0x0010 /* count expired */ #define ATA_SECURITY_FROZEN 0x0008 /* security config is frozen */ #define ATA_SECURITY_LOCKED 0x0004 /* drive is locked */ #define ATA_SECURITY_ENABLED 0x0002 /* ATA Security is enabled */ #define ATA_SECURITY_SUPPORTED 0x0001 /* ATA Security is supported */ u_int16_t reserved129[31]; /*160*/ u_int16_t cfa_powermode1; u_int16_t reserved161; /*162*/ u_int16_t cfa_kms_support; /*163*/ u_int16_t cfa_trueide_modes; /*164*/ u_int16_t cfa_memory_modes; u_int16_t reserved165[3]; /*168*/ u_int16_t form_factor; #define ATA_FORM_FACTOR_MASK 0x000f #define ATA_FORM_FACTOR_NOT_REPORTED 0x0000 #define ATA_FORM_FACTOR_5_25 0x0001 #define ATA_FORM_FACTOR_3_5 0x0002 #define ATA_FORM_FACTOR_2_5 0x0003 #define ATA_FORM_FACTOR_1_8 0x0004 #define ATA_FORM_FACTOR_SUB_1_8 0x0005 #define ATA_FORM_FACTOR_MSATA 0x0006 #define ATA_FORM_FACTOR_M_2 0x0007 #define ATA_FORM_FACTOR_MICRO_SSD 0x0008 #define ATA_FORM_FACTOR_C_FAST 0x0009 /*169*/ u_int16_t support_dsm; #define ATA_SUPPORT_DSM_TRIM 0x0001 - u_int16_t reserved170[6]; +/*170*/ u_int8_t product_id[8]; /* Additional Product Identifier */ + u_int16_t reserved174[2]; /*176*/ u_int8_t media_serial[60]; /*206*/ u_int16_t sct; u_int16_t reserved207[2]; /*209*/ u_int16_t lsalign; /*210*/ u_int16_t wrv_sectors_m3_1; u_int16_t wrv_sectors_m3_2; /*212*/ u_int16_t wrv_sectors_m2_1; u_int16_t wrv_sectors_m2_2; /*214*/ u_int16_t nv_cache_caps; /*215*/ u_int16_t nv_cache_size_1; u_int16_t nv_cache_size_2; /*217*/ u_int16_t media_rotation_rate; #define ATA_RATE_NOT_REPORTED 0x0000 #define ATA_RATE_NON_ROTATING 0x0001 u_int16_t reserved218; /*219*/ u_int16_t nv_cache_opt; /*220*/ u_int16_t wrv_mode; u_int16_t reserved221; /*222*/ u_int16_t transport_major; /*223*/ u_int16_t transport_minor; u_int16_t reserved224[31]; /*255*/ u_int16_t integrity; } __packed; /* ATA Dataset Management */ #define ATA_DSM_BLK_SIZE 512 #define ATA_DSM_BLK_RANGES 64 #define ATA_DSM_RANGE_SIZE 8 #define ATA_DSM_RANGE_MAX 65535 /* * ATA Device Register * * bit 7 Obsolete (was 1 in early ATA specs) * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS * bit 5 Obsolete (was 1 in early ATA specs) * bit 4 1 = Slave Drive, 0 = Master Drive * bit 3-0 In LBA mode, 27-24 of address. In CHS mode, head number */ #define ATA_DEV_MASTER 0x00 #define ATA_DEV_SLAVE 0x10 #define ATA_DEV_LBA 0x40 /* ATA limits */ #define ATA_MAX_28BIT_LBA 268435455UL /* ATA Status Register */ #define ATA_STATUS_ERROR 0x01 #define ATA_STATUS_SENSE_AVAIL 0x02 #define ATA_STATUS_ALIGN_ERR 0x04 #define ATA_STATUS_DATA_REQ 0x08 #define ATA_STATUS_DEF_WRITE_ERR 0x10 #define ATA_STATUS_DEVICE_FAULT 0x20 #define ATA_STATUS_DEVICE_READY 0x40 #define ATA_STATUS_BUSY 0x80 /* ATA Error Register */ #define ATA_ERROR_ABORT 0x04 #define ATA_ERROR_ID_NOT_FOUND 0x10 /* ATA HPA Features */ #define ATA_HPA_FEAT_MAX_ADDR 0x00 #define ATA_HPA_FEAT_SET_PWD 0x01 #define ATA_HPA_FEAT_LOCK 0x02 #define ATA_HPA_FEAT_UNLOCK 0x03 #define ATA_HPA_FEAT_FREEZE 0x04 /* ATA transfer modes */ #define ATA_MODE_MASK 0x0f #define ATA_DMA_MASK 0xf0 #define ATA_PIO 0x00 #define ATA_PIO0 0x08 #define ATA_PIO1 0x09 #define ATA_PIO2 0x0a #define ATA_PIO3 0x0b #define ATA_PIO4 0x0c #define ATA_PIO_MAX 0x0f #define ATA_DMA 0x10 #define ATA_WDMA0 0x20 #define ATA_WDMA1 0x21 #define ATA_WDMA2 0x22 #define ATA_UDMA0 0x40 #define ATA_UDMA1 0x41 #define ATA_UDMA2 0x42 #define ATA_UDMA3 0x43 #define ATA_UDMA4 0x44 #define ATA_UDMA5 0x45 #define ATA_UDMA6 0x46 #define ATA_SA150 0x47 #define ATA_SA300 0x48 #define ATA_SA600 0x49 #define ATA_DMA_MAX 0x4f /* ATA commands */ #define ATA_NOP 0x00 /* NOP */ #define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */ #define ATA_NF_AUTOPOLL 0x01 /* start autopoll function */ #define ATA_DATA_SET_MANAGEMENT 0x06 #define ATA_DSM_TRIM 0x01 #define ATA_DEVICE_RESET 0x08 /* reset device */ #define ATA_READ 0x20 /* read */ #define ATA_READ48 0x24 /* read 48bit LBA */ #define ATA_READ_DMA48 0x25 /* read DMA 48bit LBA */ #define ATA_READ_DMA_QUEUED48 0x26 /* read DMA QUEUED 48bit LBA */ #define ATA_READ_NATIVE_MAX_ADDRESS48 0x27 /* read native max addr 48bit */ #define ATA_READ_MUL48 0x29 /* read multi 48bit LBA */ #define ATA_READ_STREAM_DMA48 0x2a /* read DMA stream 48bit LBA */ #define ATA_READ_LOG_EXT 0x2f /* read log ext - PIO Data-In */ #define ATA_READ_STREAM48 0x2b /* read stream 48bit LBA */ #define ATA_WRITE 0x30 /* write */ #define ATA_WRITE48 0x34 /* write 48bit LBA */ #define ATA_WRITE_DMA48 0x35 /* write DMA 48bit LBA */ #define ATA_WRITE_DMA_QUEUED48 0x36 /* write DMA QUEUED 48bit LBA*/ #define ATA_SET_MAX_ADDRESS48 0x37 /* set max address 48bit */ #define ATA_WRITE_MUL48 0x39 /* write multi 48bit LBA */ #define ATA_WRITE_STREAM_DMA48 0x3a #define ATA_WRITE_STREAM48 0x3b #define ATA_WRITE_DMA_FUA48 0x3d #define ATA_WRITE_DMA_QUEUED_FUA48 0x3e #define ATA_WRITE_LOG_EXT 0x3f #define ATA_READ_VERIFY 0x40 #define ATA_READ_VERIFY48 0x42 #define ATA_WRITE_UNCORRECTABLE48 0x45 /* write uncorrectable 48bit LBA */ #define ATA_WU_PSEUDO 0x55 /* pseudo-uncorrectable error */ #define ATA_WU_FLAGGED 0xaa /* flagged-uncorrectable error */ #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ #define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */ #define ATA_ZM_REPORT_ZONES 0x00 /* report zones */ #define ATA_WRITE_LOG_DMA_EXT 0x57 /* WRITE LOG DMA EXT */ #define ATA_TRUSTED_NON_DATA 0x5b /* TRUSTED NON-DATA */ #define ATA_TRUSTED_RECEIVE 0x5c /* TRUSTED RECEIVE */ #define ATA_TRUSTED_RECEIVE_DMA 0x5d /* TRUSTED RECEIVE DMA */ #define ATA_TRUSTED_SEND 0x5e /* TRUSTED SEND */ #define ATA_TRUSTED_SEND_DMA 0x5f /* TRUSTED SEND DMA */ #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ #define ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ #define ATA_ABORT_NCQ_QUEUE 0x00 /* abort NCQ queue */ #define ATA_DEADLINE_HANDLING 0x01 /* deadline handling */ #define ATA_SET_FEATURES 0x05 /* set features */ #define ATA_ZERO_EXT 0x06 /* zero ext */ #define ATA_NCQ_ZAC_MGMT_OUT 0x07 /* NCQ ZAC mgmt out no data */ #define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */ #define ATA_SFPDMA_DSM 0x00 /* Data set management */ #define ATA_SFPDMA_DSM_TRIM 0x01 /* Set trim bit in auxiliary */ #define ATA_SFPDMA_HYBRID_EVICT 0x01 /* Hybrid Evict */ #define ATA_SFPDMA_WLDMA 0x02 /* Write Log DMA EXT */ #define ATA_SFPDMA_ZAC_MGMT_OUT 0x03 /* NCQ ZAC mgmt out w/data */ #define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */ #define ATA_RFPDMA_RL_DMA_EXT 0x00 /* Read Log DMA EXT */ #define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ #define ATA_AMAX_ADDR 0x78 /* Accessible Max Address */ #define ATA_AMAX_ADDR_GET 0x00 /* GET NATIVE MAX ADDRESS EXT */ #define ATA_AMAX_ADDR_SET 0x01 /* SET ACCESSIBLE MAX ADDRESS EXT */ #define ATA_AMAX_ADDR_FREEZE 0x02 /* FREEZE ACCESSIBLE MAX ADDRESS EXT */ #define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ #define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ #define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ #define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ #define ATA_ZM_RWP 0x04 /* reset write pointer */ #define ATA_DOWNLOAD_MICROCODE 0x92 /* DOWNLOAD MICROCODE */ #define ATA_DOWNLOAD_MICROCODE_DMA 0x93 /* DOWNLOAD MICROCODE DMA */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ #define ATA_SMART_CMD 0xb0 /* SMART command */ #define ATA_SANITIZE 0xb4 /* sanitize device */ #define ATA_CFA_ERASE 0xc0 /* CFA erase */ #define ATA_READ_MUL 0xc4 /* read multi */ #define ATA_WRITE_MUL 0xc5 /* write multi */ #define ATA_SET_MULTI 0xc6 /* set multi size */ #define ATA_READ_DMA_QUEUED 0xc7 /* read DMA QUEUED */ #define ATA_READ_DMA 0xc8 /* read DMA */ #define ATA_WRITE_DMA 0xca /* write DMA */ #define ATA_WRITE_DMA_QUEUED 0xcc /* write DMA QUEUED */ #define ATA_WRITE_MUL_FUA48 0xce #define ATA_STANDBY_IMMEDIATE 0xe0 /* standby immediate */ #define ATA_IDLE_IMMEDIATE 0xe1 /* idle immediate */ #define ATA_STANDBY_CMD 0xe2 /* standby */ #define ATA_IDLE_CMD 0xe3 /* idle */ #define ATA_READ_BUFFER 0xe4 /* read buffer */ #define ATA_READ_PM 0xe4 /* read portmultiplier */ #define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ #define ATA_WRITE_BUFFER 0xe8 /* write buffer */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ #define ATA_READ_BUFFER_DMA 0xe9 /* read buffer DMA */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ #define ATA_WRITE_BUFFER_DMA 0xeb /* write buffer DMA */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ #define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ #define ATA_SF_SETXFER 0x03 /* set transfer mode */ #define ATA_SF_APM 0x05 /* Enable APM feature set */ #define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ #define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ #define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ #define ATA_SF_WRV 0x0b /* Enable Write-Read-Verify */ #define ATA_SF_DLC 0x0c /* Enable device life control */ #define ATA_SF_SATA 0x10 /* Enable use of SATA feature */ #define ATA_SF_FFC 0x41 /* Free-fall Control */ #define ATA_SF_MHIST 0x43 /* Set Max Host Sect. Times */ #define ATA_SF_RATE 0x45 /* Set Rate Basis */ #define ATA_SF_EPC 0x4A /* Extended Power Conditions */ #define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */ #define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */ #define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */ #define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */ #define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */ #define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ #define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/ #define ATA_SF_DSN 0x63 /* Device Stats Notification */ #define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */ #define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ #define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ #define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ #define ATA_SECURITY_ERASE_UNIT 0xf4 /* erase all blocks on drive */ #define ATA_SECURITY_FREEZE_LOCK 0xf5 /* freeze security config */ #define ATA_SECURITY_DISABLE_PASSWORD 0xf6 /* disable drive password */ #define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */ #define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */ /* ATAPI commands */ #define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */ #define ATAPI_REZERO 0x01 /* rewind */ #define ATAPI_REQUEST_SENSE 0x03 /* get sense data */ #define ATAPI_FORMAT 0x04 /* format unit */ #define ATAPI_READ 0x08 /* read data */ #define ATAPI_WRITE 0x0a /* write data */ #define ATAPI_WEOF 0x10 /* write filemark */ #define ATAPI_WF_WRITE 0x01 #define ATAPI_SPACE 0x11 /* space command */ #define ATAPI_SP_FM 0x01 #define ATAPI_SP_EOD 0x03 #define ATAPI_INQUIRY 0x12 /* get inquiry data */ #define ATAPI_MODE_SELECT 0x15 /* mode select */ #define ATAPI_ERASE 0x19 /* erase */ #define ATAPI_MODE_SENSE 0x1a /* mode sense */ #define ATAPI_START_STOP 0x1b /* start/stop unit */ #define ATAPI_SS_LOAD 0x01 #define ATAPI_SS_RETENSION 0x02 #define ATAPI_SS_EJECT 0x04 #define ATAPI_PREVENT_ALLOW 0x1e /* media removal */ #define ATAPI_READ_FORMAT_CAPACITIES 0x23 /* get format capacities */ #define ATAPI_READ_CAPACITY 0x25 /* get volume capacity */ #define ATAPI_READ_BIG 0x28 /* read data */ #define ATAPI_WRITE_BIG 0x2a /* write data */ #define ATAPI_LOCATE 0x2b /* locate to position */ #define ATAPI_READ_POSITION 0x34 /* read position */ #define ATAPI_SYNCHRONIZE_CACHE 0x35 /* flush buf, close channel */ #define ATAPI_WRITE_BUFFER 0x3b /* write device buffer */ #define ATAPI_READ_BUFFER 0x3c /* read device buffer */ #define ATAPI_READ_SUBCHANNEL 0x42 /* get subchannel info */ #define ATAPI_READ_TOC 0x43 /* get table of contents */ #define ATAPI_PLAY_10 0x45 /* play by lba */ #define ATAPI_PLAY_MSF 0x47 /* play by MSF address */ #define ATAPI_PLAY_TRACK 0x48 /* play by track number */ #define ATAPI_PAUSE 0x4b /* pause audio operation */ #define ATAPI_READ_DISK_INFO 0x51 /* get disk info structure */ #define ATAPI_READ_TRACK_INFO 0x52 /* get track info structure */ #define ATAPI_RESERVE_TRACK 0x53 /* reserve track */ #define ATAPI_SEND_OPC_INFO 0x54 /* send OPC structurek */ #define ATAPI_MODE_SELECT_BIG 0x55 /* set device parameters */ #define ATAPI_REPAIR_TRACK 0x58 /* repair track */ #define ATAPI_READ_MASTER_CUE 0x59 /* read master CUE info */ #define ATAPI_MODE_SENSE_BIG 0x5a /* get device parameters */ #define ATAPI_CLOSE_TRACK 0x5b /* close track/session */ #define ATAPI_READ_BUFFER_CAPACITY 0x5c /* get buffer capicity */ #define ATAPI_SEND_CUE_SHEET 0x5d /* send CUE sheet */ #define ATAPI_SERVICE_ACTION_IN 0x96 /* get service data */ #define ATAPI_BLANK 0xa1 /* blank the media */ #define ATAPI_SEND_KEY 0xa3 /* send DVD key structure */ #define ATAPI_REPORT_KEY 0xa4 /* get DVD key structure */ #define ATAPI_PLAY_12 0xa5 /* play by lba */ #define ATAPI_LOAD_UNLOAD 0xa6 /* changer control command */ #define ATAPI_READ_STRUCTURE 0xad /* get DVD structure */ #define ATAPI_PLAY_CD 0xb4 /* universal play command */ #define ATAPI_SET_SPEED 0xbb /* set drive speed */ #define ATAPI_MECH_STATUS 0xbd /* get changer status */ #define ATAPI_READ_CD 0xbe /* read data */ #define ATAPI_POLL_DSC 0xff /* poll DSC status bit */ struct ata_ioc_devices { int channel; char name[2][32]; struct ata_params params[2]; }; /* pr channel ATA ioctl calls */ #define IOCATAGMAXCHANNEL _IOR('a', 1, int) #define IOCATAREINIT _IOW('a', 2, int) #define IOCATAATTACH _IOW('a', 3, int) #define IOCATADETACH _IOW('a', 4, int) #define IOCATADEVICES _IOWR('a', 5, struct ata_ioc_devices) /* ATAPI request sense structure */ struct atapi_sense { u_int8_t error; /* current or deferred errors */ #define ATA_SENSE_VALID 0x80 u_int8_t segment; /* segment number */ u_int8_t key; /* sense key */ #define ATA_SENSE_KEY_MASK 0x0f /* sense key mask */ #define ATA_SENSE_NO_SENSE 0x00 /* no specific sense key info */ #define ATA_SENSE_RECOVERED_ERROR 0x01 /* command OK, data recovered */ #define ATA_SENSE_NOT_READY 0x02 /* no access to drive */ #define ATA_SENSE_MEDIUM_ERROR 0x03 /* non-recovered data error */ #define ATA_SENSE_HARDWARE_ERROR 0x04 /* non-recoverable HW failure */ #define ATA_SENSE_ILLEGAL_REQUEST 0x05 /* invalid command param(s) */ #define ATA_SENSE_UNIT_ATTENTION 0x06 /* media changed */ #define ATA_SENSE_DATA_PROTECT 0x07 /* write protect */ #define ATA_SENSE_BLANK_CHECK 0x08 /* blank check */ #define ATA_SENSE_VENDOR_SPECIFIC 0x09 /* vendor specific skey */ #define ATA_SENSE_COPY_ABORTED 0x0a /* copy aborted */ #define ATA_SENSE_ABORTED_COMMAND 0x0b /* command aborted, try again */ #define ATA_SENSE_EQUAL 0x0c /* equal */ #define ATA_SENSE_VOLUME_OVERFLOW 0x0d /* volume overflow */ #define ATA_SENSE_MISCOMPARE 0x0e /* data dont match the medium */ #define ATA_SENSE_RESERVED 0x0f #define ATA_SENSE_ILI 0x20; #define ATA_SENSE_EOM 0x40; #define ATA_SENSE_FILEMARK 0x80; u_int32_t cmd_info; /* cmd information */ u_int8_t sense_length; /* additional sense len (n-7) */ u_int32_t cmd_specific_info; /* additional cmd spec info */ u_int8_t asc; /* additional sense code */ u_int8_t ascq; /* additional sense code qual */ u_int8_t replaceable_unit_code; /* replaceable unit code */ u_int8_t specific; /* sense key specific */ #define ATA_SENSE_SPEC_VALID 0x80 #define ATA_SENSE_SPEC_MASK 0x7f u_int8_t specific1; /* sense key specific */ u_int8_t specific2; /* sense key specific */ } __packed; /* * SET FEATURES subcommands */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * These values go in the LBA 3:0. */ #define ATA_SF_EPC_RESTORE 0x00 /* Restore Power Condition Settings */ #define ATA_SF_EPC_GOTO 0x01 /* Go To Power Condition */ #define ATA_SF_EPC_SET_TIMER 0x02 /* Set Power Condition Timer */ #define ATA_SF_EPC_SET_STATE 0x03 /* Set Power Condition State */ #define ATA_SF_EPC_ENABLE 0x04 /* Enable the EPC feature set */ #define ATA_SF_EPC_DISABLE 0x05 /* Disable the EPC feature set */ #define ATA_SF_EPC_SET_SOURCE 0x06 /* Set EPC Power Source */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Power Condition ID field * These values go in the count register. */ #define ATA_EPC_STANDBY_Z 0x00 /* Substate of PM2:Standby */ #define ATA_EPC_STANDBY_Y 0x01 /* Substate of PM2:Standby */ #define ATA_EPC_IDLE_A 0x81 /* Substate of PM1:Idle */ #define ATA_EPC_IDLE_B 0x82 /* Substate of PM1:Idle */ #define ATA_EPC_IDLE_C 0x83 /* Substate of PM1:Idle */ #define ATA_EPC_ALL 0xff /* All supported power conditions */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Restore Power Conditions Settings subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_RST_DFLT 0x40 /* 1=Rst from Default, 0= from Saved */ #define ATA_SF_EPC_RST_SAVE 0x10 /* 1=Save on completion */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Got To Power Condition subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_GOTO_DELAY 0x02000000 /* Delayed entry bit */ #define ATA_SF_EPC_GOTO_HOLD 0x01000000 /* Hold Power Cond bit */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set Power Condition Timer subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_TIMER_MASK 0x00ffff00 /* Timer field */ #define ATA_SF_EPC_TIMER_SHIFT 8 #define ATA_SF_EPC_TIMER_SEC 0x00000080 /* Timer units, 1=sec, 0=.1s */ #define ATA_SF_EPC_TIMER_EN 0x00000020 /* Enable/disable cond. */ #define ATA_SF_EPC_TIMER_SAVE 0x00000010 /* Save settings on comp. */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set Power Condition State subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_SETCON_EN 0x00000020 /* Enable power cond. */ #define ATA_SF_EPC_SETCON_SAVE 0x00000010 /* Save settings on comp */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set EPC Power Source subcommand * These values go in the count register. */ #define ATA_SF_EPC_SRC_UNKNOWN 0x0000 /* Unknown source */ #define ATA_SF_EPC_SRC_BAT 0x0001 /* battery source */ #define ATA_SF_EPC_SRC_NOT_BAT 0x0002 /* not battery source */ #define ATA_LOG_DIRECTORY 0x00 /* Directory of all logs */ #define ATA_POWER_COND_LOG 0x08 /* Power Conditions Log */ #define ATA_PCL_IDLE 0x00 /* Idle Power Conditions Page */ #define ATA_PCL_STANDBY 0x01 /* Standby Power Conditions Page */ #define ATA_IDENTIFY_DATA_LOG 0x30 /* Identify Device Data Log */ #define ATA_IDL_PAGE_LIST 0x00 /* List of supported pages */ #define ATA_IDL_IDENTIFY_DATA 0x01 /* Copy of Identify Device data */ #define ATA_IDL_CAPACITY 0x02 /* Capacity */ #define ATA_IDL_SUP_CAP 0x03 /* Supported Capabilities */ #define ATA_IDL_CUR_SETTINGS 0x04 /* Current Settings */ #define ATA_IDL_ATA_STRINGS 0x05 /* ATA Strings */ #define ATA_IDL_SECURITY 0x06 /* Security */ #define ATA_IDL_PARALLEL_ATA 0x07 /* Parallel ATA */ #define ATA_IDL_SERIAL_ATA 0x08 /* Serial ATA */ #define ATA_IDL_ZDI 0x09 /* Zoned Device Information */ struct ata_gp_log_dir { uint8_t header[2]; #define ATA_GP_LOG_DIR_VERSION 0x0001 uint8_t num_pages[255*2]; /* Number of log pages at address */ }; /* * ATA Power Conditions log descriptor */ struct ata_power_cond_log_desc { uint8_t reserved1; uint8_t flags; #define ATA_PCL_COND_SUPPORTED 0x80 #define ATA_PCL_COND_SAVEABLE 0x40 #define ATA_PCL_COND_CHANGEABLE 0x20 #define ATA_PCL_DEFAULT_TIMER_EN 0x10 #define ATA_PCL_SAVED_TIMER_EN 0x08 #define ATA_PCL_CURRENT_TIMER_EN 0x04 #define ATA_PCL_HOLD_PC_NOT_SUP 0x02 uint8_t reserved2[2]; uint8_t default_timer[4]; uint8_t saved_timer[4]; uint8_t current_timer[4]; uint8_t nom_time_to_active[4]; uint8_t min_timer[4]; uint8_t max_timer[4]; uint8_t num_transitions_to_pc[4]; uint8_t hours_in_pc[4]; uint8_t reserved3[28]; }; /* * ATA Power Conditions Log (0x08), Idle power conditions page (0x00) */ struct ata_power_cond_log_idle { struct ata_power_cond_log_desc idle_a_desc; struct ata_power_cond_log_desc idle_b_desc; struct ata_power_cond_log_desc idle_c_desc; uint8_t reserved[320]; }; /* * ATA Power Conditions Log (0x08), Standby power conditions page (0x01) */ struct ata_power_cond_log_standby { uint8_t reserved[384]; struct ata_power_cond_log_desc standby_y_desc; struct ata_power_cond_log_desc standby_z_desc; }; /* * ATA IDENTIFY DEVICE data log (0x30) page 0x00 * List of Supported IDENTIFY DEVICE data pages. */ struct ata_identify_log_pages { uint8_t header[8]; #define ATA_IDLOG_REVISION 0x0000000000000001 uint8_t entry_count; uint8_t entries[503]; }; /* * ATA IDENTIFY DEVICE data log (0x30) * Capacity (Page 0x02). */ struct ata_identify_log_capacity { uint8_t header[8]; #define ATA_CAP_HEADER_VALID 0x8000000000000000 #define ATA_CAP_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_CAP_PAGE_NUM_SHIFT 16 #define ATA_CAP_REV_MASK 0x00000000000000ff uint8_t capacity[8]; #define ATA_CAP_CAPACITY_VALID 0x8000000000000000 #define ATA_CAP_ACCESSIBLE_CAP 0x0000ffffffffffff uint8_t phys_logical_sect_size[8]; #define ATA_CAP_PL_VALID 0x8000000000000000 #define ATA_CAP_LTOP_REL_SUP 0x4000000000000000 #define ATA_CAP_LOG_SECT_SUP 0x2000000000000000 #define ATA_CAP_ALIGN_ERR_MASK 0x0000000000300000 #define ATA_CAP_LTOP_MASK 0x00000000000f0000 #define ATA_CAP_LOG_SECT_OFF 0x000000000000ffff uint8_t logical_sect_size[8]; #define ATA_CAP_LOG_SECT_VALID 0x8000000000000000 #define ATA_CAP_LOG_SECT_SIZE 0x00000000ffffffff uint8_t nominal_buffer_size[8]; #define ATA_CAP_NOM_BUF_VALID 0x8000000000000000 #define ATA_CAP_NOM_BUF_SIZE 0x7fffffffffffffff uint8_t reserved[472]; }; /* * ATA IDENTIFY DEVICE data log (0x30) * Supported Capabilities (Page 0x03). */ struct ata_identify_log_sup_cap { uint8_t header[8]; #define ATA_SUP_CAP_HEADER_VALID 0x8000000000000000 #define ATA_SUP_CAP_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_SUP_CAP_PAGE_NUM_SHIFT 16 #define ATA_SUP_CAP_REV_MASK 0x00000000000000ff uint8_t sup_cap[8]; #define ATA_SUP_CAP_VALID 0x8000000000000000 #define ATA_SC_SET_SECT_CONFIG_SUP 0x0002000000000000 /* Set Sect Conf*/ #define ATA_SC_ZERO_EXT_SUP 0x0001000000000000 /* Zero EXT */ #define ATA_SC_SUCC_NCQ_SENSE_SUP 0x0000800000000000 /* Succ. NCQ Sns */ #define ATA_SC_DLC_SUP 0x0000400000000000 /* DLC */ #define ATA_SC_RQSN_DEV_FAULT_SUP 0x0000200000000000 /* Req Sns Dev Flt*/ #define ATA_SC_DSN_SUP 0x0000100000000000 /* DSN */ #define ATA_SC_LP_STANDBY_SUP 0x0000080000000000 /* LP Standby */ #define ATA_SC_SET_EPC_PS_SUP 0x0000040000000000 /* Set EPC PS */ #define ATA_SC_AMAX_ADDR_SUP 0x0000020000000000 /* AMAX Addr */ #define ATA_SC_DRAT_SUP 0x0000008000000000 /* DRAT */ #define ATA_SC_LPS_MISALGN_SUP 0x0000004000000000 /* LPS Misalign */ #define ATA_SC_RB_DMA_SUP 0x0000001000000000 /* Read Buf DMA */ #define ATA_SC_WB_DMA_SUP 0x0000000800000000 /* Write Buf DMA */ #define ATA_SC_DNLD_MC_DMA_SUP 0x0000000200000000 /* DL MCode DMA */ #define ATA_SC_28BIT_SUP 0x0000000100000000 /* 28-bit */ #define ATA_SC_RZAT_SUP 0x0000000080000000 /* RZAT */ #define ATA_SC_NOP_SUP 0x0000000020000000 /* NOP */ #define ATA_SC_READ_BUFFER_SUP 0x0000000010000000 /* Read Buffer */ #define ATA_SC_WRITE_BUFFER_SUP 0x0000000008000000 /* Write Buffer */ #define ATA_SC_READ_LOOK_AHEAD_SUP 0x0000000002000000 /* Read Look-Ahead*/ #define ATA_SC_VOLATILE_WC_SUP 0x0000000001000000 /* Volatile WC */ #define ATA_SC_SMART_SUP 0x0000000000800000 /* SMART */ #define ATA_SC_FLUSH_CACHE_EXT_SUP 0x0000000000400000 /* Flush Cache Ext */ #define ATA_SC_48BIT_SUP 0x0000000000100000 /* 48-Bit */ #define ATA_SC_SPINUP_SUP 0x0000000000040000 /* Spin-Up */ #define ATA_SC_PUIS_SUP 0x0000000000020000 /* PUIS */ #define ATA_SC_APM_SUP 0x0000000000010000 /* APM */ #define ATA_SC_DL_MICROCODE_SUP 0x0000000000004000 /* DL Microcode */ #define ATA_SC_UNLOAD_SUP 0x0000000000002000 /* Unload */ #define ATA_SC_WRITE_FUA_EXT_SUP 0x0000000000001000 /* Write FUA EXT */ #define ATA_SC_GPL_SUP 0x0000000000000800 /* GPL */ #define ATA_SC_STREAMING_SUP 0x0000000000000400 /* Streaming */ #define ATA_SC_SMART_SELFTEST_SUP 0x0000000000000100 /* SMART self-test */ #define ATA_SC_SMART_ERR_LOG_SUP 0x0000000000000080 /* SMART Err Log */ #define ATA_SC_EPC_SUP 0x0000000000000040 /* EPC */ #define ATA_SC_SENSE_SUP 0x0000000000000020 /* Sense data */ #define ATA_SC_FREEFALL_SUP 0x0000000000000010 /* Free-Fall */ #define ATA_SC_DM_MODE3_SUP 0x0000000000000008 /* DM Mode 3 */ #define ATA_SC_GPL_DMA_SUP 0x0000000000000004 /* GPL DMA */ #define ATA_SC_WRITE_UNCOR_SUP 0x0000000000000002 /* Write uncorr. */ #define ATA_SC_WRV_SUP 0x0000000000000001 /* WRV */ uint8_t download_code_cap[8]; #define ATA_DL_CODE_VALID 0x8000000000000000 #define ATA_DLC_DM_OFFSETS_DEFER_SUP 0x0000000400000000 #define ATA_DLC_DM_IMMED_SUP 0x0000000200000000 #define ATA_DLC_DM_OFF_IMMED_SUP 0x0000000100000000 #define ATA_DLC_DM_MAX_XFER_SIZE_MASK 0x00000000ffff0000 #define ATA_DLC_DM_MAX_XFER_SIZE_SHIFT 16 #define ATA_DLC_DM_MIN_XFER_SIZE_MASK 0x000000000000ffff uint8_t nom_media_rotation_rate[8]; #define ATA_NOM_MEDIA_ROTATION_VALID 0x8000000000000000 #define ATA_ROTATION_MASK 0x000000000000ffff uint8_t form_factor[8]; #define ATA_FORM_FACTOR_VALID 0x8000000000000000 #define ATA_FF_MASK 0x000000000000000f #define ATA_FF_NOT_REPORTED 0x0000000000000000 /* Not reported */ #define ATA_FF_525_IN 0x0000000000000001 /* 5.25 inch */ #define ATA_FF_35_IN 0x0000000000000002 /* 3.5 inch */ #define ATA_FF_25_IN 0x0000000000000003 /* 2.5 inch */ #define ATA_FF_18_IN 0x0000000000000004 /* 1.8 inch */ #define ATA_FF_LT_18_IN 0x0000000000000005 /* < 1.8 inch */ #define ATA_FF_MSATA 0x0000000000000006 /* mSATA */ #define ATA_FF_M2 0x0000000000000007 /* M.2 */ #define ATA_FF_MICROSSD 0x0000000000000008 /* MicroSSD */ #define ATA_FF_CFAST 0x0000000000000009 /* CFast */ uint8_t wrv_sec_cnt_mode3[8]; #define ATA_WRV_MODE3_VALID 0x8000000000000000 #define ATA_WRV_MODE3_COUNT 0x00000000ffffffff uint8_t wrv_sec_cnt_mode2[8]; #define ATA_WRV_MODE2_VALID 0x8000000000000000 #define ATA_WRV_MODE2_COUNT 0x00000000ffffffff uint8_t wwn[16]; /* XXX KDM need to figure out how to handle 128-bit fields */ uint8_t dsm[8]; #define ATA_DSM_VALID 0x8000000000000000 #define ATA_LB_MARKUP_SUP 0x000000000000ff00 #define ATA_TRIM_SUP 0x0000000000000001 uint8_t util_per_unit_time[16]; /* XXX KDM need to figure out how to handle 128-bit fields */ uint8_t util_usage_rate_sup[8]; #define ATA_UTIL_USAGE_RATE_VALID 0x8000000000000000 #define ATA_SETTING_RATE_SUP 0x0000000000800000 #define ATA_SINCE_POWERON_SUP 0x0000000000000100 #define ATA_POH_RATE_SUP 0x0000000000000010 #define ATA_DATE_TIME_RATE_SUP 0x0000000000000001 uint8_t zoned_cap[8]; #define ATA_ZONED_VALID 0x8000000000000000 #define ATA_ZONED_MASK 0x0000000000000003 uint8_t sup_zac_cap[8]; #define ATA_SUP_ZAC_CAP_VALID 0x8000000000000000 #define ATA_ND_RWP_SUP 0x0000000000000010 /* Reset Write Ptr*/ #define ATA_ND_FINISH_ZONE_SUP 0x0000000000000008 /* Finish Zone */ #define ATA_ND_CLOSE_ZONE_SUP 0x0000000000000004 /* Close Zone */ #define ATA_ND_OPEN_ZONE_SUP 0x0000000000000002 /* Open Zone */ #define ATA_REPORT_ZONES_SUP 0x0000000000000001 /* Report Zones */ uint8_t reserved[392]; }; /* * ATA Identify Device Data Log Zoned Device Information Page (0x09). * Current as of ZAC r04a, August 25, 2015. */ struct ata_zoned_info_log { uint8_t header[8]; #define ATA_ZDI_HEADER_VALID 0x8000000000000000 #define ATA_ZDI_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_ZDI_PAGE_NUM_SHIFT 16 #define ATA_ZDI_REV_MASK 0x00000000000000ff uint8_t zoned_cap[8]; #define ATA_ZDI_CAP_VALID 0x8000000000000000 #define ATA_ZDI_CAP_URSWRZ 0x0000000000000001 uint8_t zoned_settings[8]; #define ATA_ZDI_SETTINGS_VALID 0x8000000000000000 uint8_t optimal_seq_zones[8]; #define ATA_ZDI_OPT_SEQ_VALID 0x8000000000000000 #define ATA_ZDI_OPT_SEQ_MASK 0x00000000ffffffff uint8_t optimal_nonseq_zones[8]; #define ATA_ZDI_OPT_NS_VALID 0x8000000000000000 #define ATA_ZDI_OPT_NS_MASK 0x00000000ffffffff uint8_t max_seq_req_zones[8]; #define ATA_ZDI_MAX_SEQ_VALID 0x8000000000000000 #define ATA_ZDI_MAX_SEQ_MASK 0x00000000ffffffff uint8_t version_info[8]; #define ATA_ZDI_VER_VALID 0x8000000000000000 #define ATA_ZDI_VER_ZAC_SUP 0x0100000000000000 #define ATA_ZDI_VER_ZAC_MASK 0x00000000000000ff uint8_t reserved[456]; }; struct ata_ioc_request { union { struct { u_int8_t command; u_int8_t feature; u_int64_t lba; u_int16_t count; } ata; struct { char ccb[16]; struct atapi_sense sense; } atapi; } u; caddr_t data; int count; int flags; #define ATA_CMD_CONTROL 0x01 #define ATA_CMD_READ 0x02 #define ATA_CMD_WRITE 0x04 #define ATA_CMD_ATAPI 0x08 int timeout; int error; }; struct ata_security_password { u_int16_t ctrl; #define ATA_SECURITY_PASSWORD_USER 0x0000 #define ATA_SECURITY_PASSWORD_MASTER 0x0001 #define ATA_SECURITY_ERASE_NORMAL 0x0000 #define ATA_SECURITY_ERASE_ENHANCED 0x0002 #define ATA_SECURITY_LEVEL_HIGH 0x0000 #define ATA_SECURITY_LEVEL_MAXIMUM 0x0100 u_int8_t password[32]; u_int16_t revision; u_int16_t reserved[238]; }; /* pr device ATA ioctl calls */ #define IOCATAREQUEST _IOWR('a', 100, struct ata_ioc_request) #define IOCATAGPARM _IOR('a', 101, struct ata_params) #define IOCATAGMODE _IOR('a', 102, int) #define IOCATASMODE _IOW('a', 103, int) #define IOCATAGSPINDOWN _IOR('a', 104, int) #define IOCATASSPINDOWN _IOW('a', 105, int) struct ata_ioc_raid_config { int lun; int type; #define AR_JBOD 0x0001 #define AR_SPAN 0x0002 #define AR_RAID0 0x0004 #define AR_RAID1 0x0008 #define AR_RAID01 0x0010 #define AR_RAID3 0x0020 #define AR_RAID4 0x0040 #define AR_RAID5 0x0080 int interleave; int status; #define AR_READY 1 #define AR_DEGRADED 2 #define AR_REBUILDING 4 int progress; int total_disks; int disks[16]; }; struct ata_ioc_raid_status { int lun; int type; int interleave; int status; int progress; int total_disks; struct { int state; #define AR_DISK_ONLINE 0x01 #define AR_DISK_PRESENT 0x02 #define AR_DISK_SPARE 0x04 int lun; } disks[16]; }; /* ATA RAID ioctl calls */ #define IOCATARAIDCREATE _IOWR('a', 200, struct ata_ioc_raid_config) #define IOCATARAIDDELETE _IOW('a', 201, int) #define IOCATARAIDSTATUS _IOWR('a', 202, struct ata_ioc_raid_status) #define IOCATARAIDADDSPARE _IOW('a', 203, struct ata_ioc_raid_config) #define IOCATARAIDREBUILD _IOW('a', 204, int) #endif /* _SYS_ATA_H_ */ Index: projects/fuse2/sys/vm/swap_pager.c =================================================================== --- projects/fuse2/sys/vm/swap_pager.c (revision 350434) +++ projects/fuse2/sys/vm/swap_pager.c (revision 350435) @@ -1,3004 +1,2991 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64. * The 64-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 32 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif #define SWAP_META_PAGES PCTRIE_COUNT /* * A swblk structure maps each page index within a * SWAP_META_PAGES-aligned and sized range to the address of an * on-disk swap block (or SWAPBLK_NONE). The collection of these * mappings for an entire vm object is implemented as a pc-trie. */ struct swblk { vm_pindex_t p; daddr_t d[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static u_long swap_reserved; static u_long swap_total; static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_reserved, 0, sysctl_page_shift, "A", "Amount of swap storage needed to back all allocated anonymous memory."); SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_total, 0, sysctl_page_shift, "A", "Total amount of available swap storage."); static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) static int sysctl_page_shift(SYSCTL_HANDLER_ARGS) { uint64_t newval; u_long value = *(u_long *)arg1; newval = ((uint64_t)value) << PAGE_SHIFT; return (sysctl_handle_64(oidp, &newval, 0, req)); } int swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { u_long r, s, prev, pincr; int res, error; static int curfail; static struct timeval lastfail; struct uidinfo *uip; uip = cred->cr_ruidinfo; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); error = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); if (error != 0) return (0); } #endif pincr = atop(incr); res = 0; prev = atomic_fetchadd_long(&swap_reserved, pincr); r = prev + pincr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_wire_count(); } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; } else { prev = atomic_fetchadd_long(&swap_reserved, -pincr); if (prev < pincr) panic("swap_reserved < incr on overcommit fail"); } if (res) { prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) { res = 0; prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); if (prev < pincr) panic("uip->ui_vmsize < incr on overcommit fail"); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", uip->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (racct_enable && !res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (res); } void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; u_long pincr; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); PROC_LOCK(curproc); #ifdef RACCT if (racct_enable) racct_add_force(curproc, RACCT_SWAP, incr); #endif pincr = atop(incr); atomic_add_long(&swap_reserved, pincr); uip = curproc->p_ucred->cr_ruidinfo; atomic_add_long(&uip->ui_vmsize, pincr); PROC_UNLOCK(curproc); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curproc->p_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { u_long prev, pdecr; struct uidinfo *uip; uip = cred->cr_ruidinfo; KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__, (uintmax_t)decr)); pdecr = atop(decr); prev = atomic_fetchadd_long(&swap_reserved, -pdecr); if (prev < pdecr) panic("swap_reserved < decr"); prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); if (prev < pdecr) printf("negative vmsize for uid = %d\n", uip->ui_uid); #ifdef RACCT if (racct_enable) racct_sub_cred(cred, RACCT_SWAP, decr); #endif } #define SWM_POP 0x01 /* pop out */ static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A", "Swap Fragmentation Info"); static struct sx sw_alloc_sx; /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swwbuf_zone; static uma_zone_t swrbuf_zone; static uma_zone_t swblk_zone; static uma_zone_t swpctrie_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ }; /* * swap_*() routines are externally accessible. swp_*() routines are * internal. */ static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0, "Maximum size of a swap block in pages"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit); static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages); static daddr_t swp_pager_getswapspace(int *npages, int limit); /* * Metadata functions */ static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); static void swp_pager_init_freerange(daddr_t *start, daddr_t *num) { *start = SWAPBLK_NONE; *num = 0; } static void swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr) { if (*start + *num == addr) { (*num)++; } else { swp_pager_freeswapspace(*start, *num); *start = addr; *num = 1; } } static void * swblk_trie_alloc(struct pctrie *ptree) { return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0))); } static void swblk_trie_free(struct pctrie *ptree, void *node) { uma_zfree(swpctrie_zone, node); } PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); sx_init(&swdev_syscall_lock, "swsysc"); } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array, which has MAXPHYS / PAGE_SIZE entries, and our locally * defined MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER); nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF); swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4); swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2); /* * Initialize our zone, taking the user's requested size or * estimating the number we need based on the number of pages * in the system. */ n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) : vm_cnt.v_page_count / 2; swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); if (swpctrie_zone == NULL) panic("failed to create swap pctrie zone."); swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, NULL, NULL, _Alignof(struct swblk) - 1, UMA_ZONE_VM); if (swblk_zone == NULL) panic("failed to create swap blk zone."); n2 = n; do { if (uma_zone_reserve_kva(swblk_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); /* * Often uma_zone_reserve_kva() cannot reserve exactly the * requested size. Account for the difference when * calculating swap_maxpages. */ n = uma_zone_get_max(swblk_zone); if (n < n2) printf("Swap blk zone entries changed from %lu to %lu.\n", n2, n); swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblk); if (!uma_zone_reserve_kva(swpctrie_zone, n)) printf("Cannot reserve swap pctrie zone, " "reduce kern.maxswzone.\n"); } static vm_object_t swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset) { vm_object_t object; if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } /* * The un_pager.swp.swp_blks trie is initialized by * vm_object_allocate() to ensure the correct order of * visibility to other threads. */ object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset + PAGE_MASK + size)); object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } return (object); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. * * This routine must ensure that no live duplicate is created for * the named object request, which is protected against by * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) { /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = swap_pager_alloc_init(handle, cred, size, offset); if (object != NULL) { TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object, pager_object_list); } } sx_xunlock(&sw_alloc_sx); } else { object = swap_pager_alloc_init(handle, cred, size, offset); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { VM_OBJECT_WUNLOCK(object); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(object); } vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for up to the requested number of pages, and at * least a minimum number of pages. The starting swap block number * (a page index) is returned or SWAPBLK_NONE if the allocation * failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. */ static daddr_t swp_pager_getswapspace(int *io_npages, int limit) { daddr_t blk; struct swdevt *sp; int mpages, npages; blk = SWAPBLK_NONE; mpages = *io_npages; npages = imin(BLIST_MAX_ALLOC, mpages); mtx_lock(&sw_dev_mtx); sp = swdevhd; while (!TAILQ_EMPTY(&swtailq)) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if ((sp->sw_flags & SW_CLOSING) == 0) blk = blist_alloc(sp->sw_blist, &npages, mpages); if (blk != SWAPBLK_NONE) break; sp = TAILQ_NEXT(sp, sw_list); if (swdevhd == sp) { if (npages <= limit) break; mpages = npages - 1; npages >>= 1; } } if (blk != SWAPBLK_NONE) { *io_npages = npages; blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); } else { if (swap_pager_full != 2) { printf("swp_pager_getswapspace(%d): failed\n", *io_npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; } mtx_unlock(&sw_dev_mtx); return (blk); } static bool swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(bp->b_blkno, sp)) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages) { struct swdevt *sp; if (npages == 0) return; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(blk, sp)) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats */ static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct swdevt *sp; const char *devname; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname); blist_stats(sp->sw_blist, &sbuf); } mtx_unlock(&sw_dev_mtx); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { daddr_t addr, blk, n_free, s_free; int i, j, n; swp_pager_init_freerange(&s_free, &n_free); VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += n) { n = size - i; blk = swp_pager_getswapspace(&n, 1); if (blk == SWAPBLK_NONE) { swp_pager_meta_free(object, start, i); VM_OBJECT_WUNLOCK(object); return (-1); } for (j = 0; j < n; ++j) { addr = swp_pager_meta_build(object, start + i + j, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); } } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WUNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; daddr_t dstaddr, n_free, s_free, srcaddr; VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource && srcobject->handle != NULL) { vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); VM_OBJECT_WUNLOCK(dstobject); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(dstobject); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } /* * Transfer source to destination. */ swp_pager_init_freerange(&s_free, &n_free); for (i = 0; i < dstobject->size; ++i) { srcaddr = swp_pager_meta_ctl(srcobject, i + offset, SWM_POP); if (srcaddr == SWAPBLK_NONE) continue; dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr != SWAPBLK_NONE) { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the source block. */ swp_pager_update_freerange(&s_free, &n_free, srcaddr); continue; } /* * Destination has no swapblk and is not resident, * copy source. * * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); dstaddr = swp_pager_meta_build(dstobject, i, srcaddr); KASSERT(dstaddr == SWAPBLK_NONE, ("Unexpected destination swapblk")); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } swp_pager_freeswapspace(s_free, n_free); /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk, blk0; int i; VM_OBJECT_ASSERT_LOCKED(object); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = i - 1; } /* * find forward-looking contiguous good backing store */ if (after != NULL) { for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = i - 1; } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page must be locked. */ static void swap_pager_unswapped(vm_page_t m) { daddr_t srcaddr; srcaddr = swp_pager_meta_ctl(m->object, m->pindex, SWM_POP); if (srcaddr != SWAPBLK_NONE) swp_pager_freeswapspace(srcaddr, 1); } /* * swap_pager_getpages() - bring pages in from swap * * Attempt to page in the pages in array "ma" of length "count". The * caller may optionally specify that additional pages preceding and * succeeding the specified range be paged in. The number of such pages * is returned in the "rbehind" and "rahead" parameters, and they will * be in the inactive queue upon return. * * The pages in "ma" must be busied and will remain busied upon return. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { struct buf *bp; vm_page_t bm, mpred, msucc, p; vm_pindex_t pindex; daddr_t blk; int i, maxahead, maxbehind, reqcount; reqcount = count; /* * Determine the final number of read-behind pages and * allocate them BEFORE releasing the object lock. Otherwise, * there can be a problematic race with vm_object_split(). * Specifically, vm_object_split() might first transfer pages * that precede ma[0] in the current object to a new object, * and then this function incorrectly recreates those pages as * read-behind pages in the current object. */ if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) return (VM_PAGER_FAIL); /* * Clip the readahead and readbehind ranges to exclude resident pages. */ if (rahead != NULL) { KASSERT(reqcount - 1 <= maxahead, ("page count %d extends beyond swap block", reqcount)); *rahead = imin(*rahead, maxahead - (reqcount - 1)); pindex = ma[reqcount - 1]->pindex; msucc = TAILQ_NEXT(ma[reqcount - 1], listq); if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) *rahead = msucc->pindex - pindex - 1; } if (rbehind != NULL) { *rbehind = imin(*rbehind, maxbehind); pindex = ma[0]->pindex; mpred = TAILQ_PREV(ma[0], pglist, listq); if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) *rbehind = pindex - mpred->pindex - 1; } bm = ma[0]; for (i = 0; i < count; i++) ma[i]->oflags |= VPO_SWAPINPROG; /* * Allocate readahead and readbehind pages. */ if (rbehind != NULL) { for (i = 1; i <= *rbehind; i++) { p = vm_page_alloc(object, ma[0]->pindex - i, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; bm = p; } *rbehind = i - 1; } if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; } *rahead = i; } if (rbehind != NULL) count += *rbehind; if (rahead != NULL) count += *rahead; vm_object_pip_add(object, count); pindex = bm->pindex; blk = swp_pager_meta_ctl(object, pindex, 0); KASSERT(blk != SWAPBLK_NONE, ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); bp = uma_zalloc(swrbuf_zone, M_WAITOK); /* Pages cannot leave the object while busy. */ for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) { MPASS(p->pindex == bm->pindex + i); bp->b_pages[i] = p; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; bp->b_npages = count; bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; bp->b_pgafter = rahead != NULL ? *rahead : 0; VM_CNT_INC(v_swapin); VM_CNT_ADD(v_swappgsin, count); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our ma[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) { ma[0]->oflags |= VPO_SWAPSLEEP; VM_CNT_INC(v_intrans); if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * If we had an unrecoverable read error pages will not be valid. */ for (i = 0; i < reqcount; i++) if (ma[i]->valid != VM_PAGE_BITS_ALL) return (VM_PAGER_ERROR); return (VM_PAGER_OK); /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_getpages_async(): * * Right now this is emulation of asynchronous operation on top of * swap_pager_getpages(). */ static int swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { int r, error; r = swap_pager_getpages(object, ma, count, rbehind, rahead); VM_OBJECT_WUNLOCK(object); switch (r) { case VM_PAGER_OK: error = 0; break; case VM_PAGER_ERROR: error = EIO; break; case VM_PAGER_FAIL: error = EINVAL; break; default: panic("unhandled swap_pager_getpages() error %d", r); } (iodone)(arg, ma, count, error); VM_OBJECT_WLOCK(object); return (r); } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy - * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. + * those whose rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ static void swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, int flags, int *rtvals) { - int i, n; - boolean_t sync; - daddr_t addr, n_free, s_free; + struct buf *bp; + daddr_t addr, blk, n_free, s_free; + vm_page_t mreq; + int i, j, n; + bool async; - swp_pager_init_freerange(&s_free, &n_free); - if (count && ma[0]->object != object) { - panic("swap_pager_putpages: object mismatch %p/%p", - object, - ma[0]->object - ); - } + KASSERT(count == 0 || ma[0]->object == object, + ("%s: object mismatch %p/%p", + __func__, object, ma[0]->object)); /* * Step 1 * - * Turn object into OBJT_SWAP - * check for bogus sysops - * force sync if not pageout process + * Turn object into OBJT_SWAP. Force sync if not a pageout process. */ if (object->type != OBJT_SWAP) { addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE); KASSERT(addr == SWAPBLK_NONE, ("unexpected object swap block")); } VM_OBJECT_WUNLOCK(object); + async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0; + swp_pager_init_freerange(&s_free, &n_free); - n = 0; - if (curproc != pageproc) - sync = TRUE; - else - sync = (flags & VM_PAGER_PUT_SYNC) != 0; - /* * Step 2 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { - int j; - struct buf *bp; - daddr_t blk; - /* Maximum I/O size is limited by maximum swap block size. */ n = min(count - i, nsw_cluster_max); /* Get a block of swap of size up to size n. */ blk = swp_pager_getswapspace(&n, 4); if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) - rtvals[i+j] = VM_PAGER_FAIL; + rtvals[i + j] = VM_PAGER_FAIL; continue; } /* - * All I/O parameters have been satisfied, build the I/O + * All I/O parameters have been satisfied. Build the I/O * request and assign the swap space. */ - if (sync != TRUE) { + if (async) { mtx_lock(&swbuf_mtx); while (nsw_wcount_async == 0) msleep(&nsw_wcount_async, &swbuf_mtx, PVM, "swbufa", 0); nsw_wcount_async--; mtx_unlock(&swbuf_mtx); } bp = uma_zalloc(swwbuf_zone, M_WAITOK); - if (sync != TRUE) + if (async) bp->b_flags = B_ASYNC; bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_WLOCK(object); for (j = 0; j < n; ++j) { - vm_page_t mreq = ma[i+j]; - + mreq = ma[i + j]; addr = swp_pager_meta_build(mreq->object, mreq->pindex, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); MPASS(mreq->dirty == VM_PAGE_BITS_ALL); mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_WUNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; VM_CNT_INC(v_swapout); VM_CNT_ADD(v_swappgsout, bp->b_npages); /* * We unconditionally set rtvals[] to VM_PAGER_PEND so that we * can call the async completion routine at the end of a * synchronous I/O operation. Otherwise, our caller would * perform duplicate unbusy and wakeup operations on the page * and object, respectively. */ for (j = 0; j < n; j++) rtvals[i + j] = VM_PAGER_PEND; /* * asynchronous * - * NOTE: b_blkno is destroyed by the call to swapdev_strategy + * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ - if (sync == FALSE) { + if (async) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); continue; } /* * synchronous * - * NOTE: b_blkno is destroyed by the call to swapdev_strategy + * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } - VM_OBJECT_WLOCK(object); swp_pager_freeswapspace(s_free, n_free); + VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * Report error - unless we ran out of memory, in which case * we've already logged it in swapgeom_strategy(). */ if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); else bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->paging_in_progress); } if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ MPASS(m->dirty == VM_PAGE_BITS_ALL); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); m->valid = VM_PAGE_BITS_ALL; if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). * A page is only written to swap after a period of * inactivity. Therefore, we do not expect it to be * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); vm_page_lock(m); vm_page_deactivate_noreuse(m); vm_page_unlock(m); vm_page_sunbusy(m); } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ if (bp->b_flags & B_ASYNC) { mtx_lock(&swbuf_mtx); if (++nsw_wcount_async == 1) wakeup(&nsw_wcount_async); mtx_unlock(&swbuf_mtx); } uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp); } int swap_pager_nswapdev(void) { return (nswapdev); } static void swp_pager_force_dirty(vm_page_t m) { vm_page_dirty(m); #ifdef INVARIANTS vm_page_lock(m); if (!vm_page_wired(m) && m->queue == PQ_NONE) panic("page %p is neither wired nor queued", m); vm_page_unlock(m); #endif vm_page_xunbusy(m); swap_pager_unswapped(m); } static void swp_pager_force_launder(vm_page_t m) { vm_page_dirty(m); vm_page_lock(m); vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); swap_pager_unswapped(m); } /* * SWP_PAGER_FORCE_PAGEIN() - force swap blocks to be paged in * * This routine dissociates pages starting at the given index within an * object from their backing store, paging them in if they do not reside * in memory. Pages that are paged in are marked dirty and placed in the * laundry queue. Pages are marked dirty because they no longer have * backing store. They are placed in the laundry queue because they have * not been accessed recently. Otherwise, they would already reside in * memory. */ static void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex, int npages) { vm_page_t ma[npages]; int i, j; KASSERT(npages > 0, ("%s: No pages", __func__)); KASSERT(npages <= MAXPHYS / PAGE_SIZE, ("%s: Too many pages: %d", __func__, npages)); vm_object_pip_add(object, npages); vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL, ma, npages); for (i = j = 0;; i++) { /* Count nonresident pages, to page-in all at once. */ if (i < npages && ma[i]->valid != VM_PAGE_BITS_ALL) continue; if (j < i) { /* Page-in nonresident pages. Mark for laundering. */ if (swap_pager_getpages(object, &ma[j], i - j, NULL, NULL) != VM_PAGER_OK) panic("%s: read from swap failed", __func__); do { swp_pager_force_launder(ma[j]); } while (++j < i); } if (i == npages) break; /* Mark dirty a resident page. */ swp_pager_force_dirty(ma[j++]); } vm_object_pip_wakeupn(object, npages); } /* * swap_pager_swapoff_object: * * Page in all of the pages that have been paged out for an object * to a swap device. */ static void swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object) { struct swblk *sb; vm_pindex_t pi, s_pindex; daddr_t blk, n_blks, s_blk; int i; n_blks = 0; for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi)) != NULL; ) { for (i = 0; i < SWAP_META_PAGES; i++) { blk = sb->d[i]; if (!swp_pager_isondev(blk, sp)) blk = SWAPBLK_NONE; /* * If there are no blocks/pages accumulated, start a new * accumulation here. */ if (n_blks == 0) { if (blk != SWAPBLK_NONE) { s_blk = blk; s_pindex = sb->p + i; n_blks = 1; } continue; } /* * If the accumulation can be extended without breaking * the sequence of consecutive blocks and pages that * swp_pager_force_pagein() depends on, do so. */ if (n_blks < MAXPHYS / PAGE_SIZE && s_blk + n_blks == blk && s_pindex + n_blks == sb->p + i) { ++n_blks; continue; } /* * The sequence of consecutive blocks and pages cannot * be extended, so page them all in here. Then, * because doing so involves releasing and reacquiring * a lock that protects the swap block pctrie, do not * rely on the current swap block. Break this loop and * re-fetch the same pindex from the pctrie again. */ swp_pager_force_pagein(object, s_pindex, n_blks); n_blks = 0; break; } if (i == SWAP_META_PAGES) pi = sb->p + SWAP_META_PAGES; } if (n_blks > 0) swp_pager_force_pagein(object, s_pindex, n_blks); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { vm_object_t object; int retries; sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; full_rescan: mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->type != OBJT_SWAP) continue; mtx_unlock(&vm_object_list_mtx); /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); /* * Dead objects are eventually terminated on their own. */ if ((object->flags & OBJ_DEAD) != 0) goto next_obj; /* * Sync with fences placed after pctrie * initialization. We must not access pctrie below * unless we checked that our object is swap and not * dead. */ atomic_thread_fence_acq(); if (object->type != OBJT_SWAP) goto next_obj; swap_pager_swapoff_object(sp, object); next_obj: VM_OBJECT_WUNLOCK(object); mtx_lock(&vm_object_list_mtx); } mtx_unlock(&vm_object_list_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } EVENTHANDLER_INVOKE(swapoff, sp); } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free? */ static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit) { int i; MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES); for (i = start; i < limit; i++) { if (sb->d[i] != SWAPBLK_NONE) return (false); } return (true); } /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is returned. */ static daddr_t swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted; struct swblk *sb, *sb1; vm_pindex_t modpi, rdpi; daddr_t prev_swapblk; int error, i; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff()'s iteration over * object_list does not see a garbage pctrie. */ atomic_thread_fence_rel(); object->type = OBJT_SWAP; KASSERT(object->handle == NULL, ("default pager with handle")); } rdpi = rounddown(pindex, SWAP_META_PAGES); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb == NULL) { if (swapblk == SWAPBLK_NONE) return (SWAPBLK_NONE); for (;;) { sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (sb != NULL) { sb->p = rdpi; for (i = 0; i < SWAP_META_PAGES; i++) sb->d[i] = SWAPBLK_NONE; if (atomic_cmpset_int(&swblk_zone_exhausted, 1, 0)) printf("swblk zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swblk_zone)) { if (atomic_cmpset_int(&swblk_zone_exhausted, 0, 1)) printf("swap blk zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxb", 10); } else uma_zwait(swblk_zone); VM_OBJECT_WLOCK(object); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb != NULL) /* * Somebody swapped out a nearby page, * allocating swblk at the rdpi index, * while we dropped the object lock. */ goto allocated; } for (;;) { error = SWAP_PCTRIE_INSERT( &object->un_pager.swp.swp_blks, sb); if (error == 0) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 1, 0)) printf("swpctrie zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swpctrie_zone)) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 0, 1)) printf("swap pctrie zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxp", 10); } else uma_zwait(swpctrie_zone); VM_OBJECT_WLOCK(object); sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb1 != NULL) { uma_zfree(swblk_zone, sb); sb = sb1; goto allocated; } } } allocated: MPASS(sb->p == rdpi); modpi = pindex % SWAP_META_PAGES; /* Return prior contents of metadata. */ prev_swapblk = sb->d[modpi]; /* Enter block into metadata. */ sb->d[modpi] = swapblk; /* * Free the swblk if we end up with the empty page run. */ if (swapblk == SWAPBLK_NONE && swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, rdpi); uma_zfree(swblk_zone, sb); } return (prev_swapblk); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t last; int i, limit, start; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP || count == 0) return; swp_pager_init_freerange(&s_free, &n_free); last = pindex + count; for (;;) { sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL || sb->p >= last) break; start = pindex > sb->p ? pindex - sb->p : 0; limit = last - sb->p < SWAP_META_PAGES ? last - sb->p : SWAP_META_PAGES; for (i = start; i < limit; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); sb->d[i] = SWAPBLK_NONE; } pindex = sb->p + SWAP_META_PAGES; if (swp_pager_swblk_empty(sb, 0, start) && swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. */ static void swp_pager_meta_free_all(vm_object_t object) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t pindex; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return; swp_pager_init_freerange(&s_free, &n_free); for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pindex)) != NULL;) { pindex = sb->p + SWAP_META_PAGES; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_METACTL() - misc control of swap meta data. * * This routine is capable of looking up, or removing swapblk * assignments in the swap meta data. It returns the swapblk being * looked-up, popped, or SWAPBLK_NONE if the block was invalid. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * SWM_POP remove from meta data but do not free it */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblk *sb; daddr_t r1; if ((flags & SWM_POP) != 0) VM_OBJECT_ASSERT_WLOCKED(object); else VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (SWAPBLK_NONE); r1 = sb->d[pindex % SWAP_META_PAGES]; if (r1 == SWAPBLK_NONE) return (SWAPBLK_NONE); if ((flags & SWM_POP) != 0) { sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE; if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); uma_zfree(swblk_zone, sb); } } return (r1); } /* * Returns the least page index which is greater than or equal to the * parameter pindex and for which there is a swap block allocated. * Returns object's size if the object's type is not swap or if there * are no allocated swap blocks for the object after the requested * pindex. */ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; int i; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP) return (object->size); sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); if (sb->p < pindex) { for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, roundup(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); } for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } /* * We get here if a swblk is present in the trie but it * doesn't map any blocks. */ MPASS(0); return (object->size); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swblk_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: sx_xunlock(&swdev_syscall_lock); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message. */ static void swapon_check_swzone(void) { unsigned long maxpages, npages; npages = swap_total; /* absolute maximum we can handle assuming 100% efficiency */ maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES; /* recommend using no more than half that amount */ if (npages > maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", npages, maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); } } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf( "WARNING: reducing swap size to maximum of %luMB per unit\n", mblocks / 1024 / 1024 * PAGE_SIZE); nblks = mblocks; } sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* * Do not free the first two block in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, 2, nblks - 2); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks - 2; swap_total += nblks; swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); EVENTHANDLER_INVOKE(swapon, sp); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: sx_xunlock(&swdev_syscall_lock); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks; #ifdef MAC int error; #endif sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); swap_total -= nblks; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); mtx_lock(&sw_dev_mtx); sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); sx_xunlock(&swdev_syscall_lock); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { if (vn_isdisk(sp->sw_vp, NULL)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } #if defined(COMPAT_FREEBSD11) #define XSWDEV_VERSION_11 1 struct xswdev11 { u_int xsw_version; uint32_t xsw_dev; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 { u_int xsw_version; u_int xsw_dev1, xsw_dev2; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 xs32; #endif #if defined(COMPAT_FREEBSD11) struct xswdev11 xs11; #endif int error; if (arg2 != 1) /* name length */ return (EINVAL); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_FREEBSD32) if (req->oldlen == sizeof(xs32)) { xs32.xsw_version = XSWDEV_VERSION; xs32.xsw_dev1 = xs.xsw_dev; xs32.xsw_dev2 = xs.xsw_dev >> 32; xs32.xsw_flags = xs.xsw_flags; xs32.xsw_nblks = xs.xsw_nblks; xs32.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs32, sizeof(xs32)); return (error); } #endif #if defined(COMPAT_FREEBSD11) if (req->oldlen == sizeof(xs11)) { xs11.xsw_version = XSWDEV_VERSION_11; xs11.xsw_dev = xs.xsw_dev; /* truncation */ xs11.xsw_flags = xs.xsw_flags; xs11.xsw_nblks = xs.xsw_nblks; xs11.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs11, sizeof(xs11)); return (error); } #endif error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_vm_swap_info, "Swap statistics by device"); /* * Count the approximate swap usage in pages for a vmspace. The * shadowed or not yet copied on write swap blocks are not accounted. * The map must be locked. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; struct swblk *sb; vm_pindex_t e, pi; long count; int i; map = &vmspace->vm_map; count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; object = cur->object.vm_object; if (object == NULL || object->type != OBJT_SWAP) continue; VM_OBJECT_RLOCK(object); if (object->type != OBJT_SWAP) goto unlock; pi = OFF_TO_IDX(cur->offset); e = pi + OFF_TO_IDX(cur->end - cur->start); for (;; pi = sb->p + SWAP_META_PAGES) { sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi); if (sb == NULL || sb->p >= e) break; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->p + i < e && sb->d[i] != SWAPBLK_NONE) count++; } } unlock: VM_OBJECT_RUNLOCK(object); } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } /* * Add a reference to the g_consumer for an inflight transaction. */ static void swapgeom_acquire(struct g_consumer *cp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index++; } /* * Remove a reference from the g_consumer. Post a close event if all * references go away, since the function might be called from the * biodone context. */ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index--; if (cp->index == 0) { if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) sp->sw_id = NULL; } } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bp->b_caller1 = NULL; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; printf("swap_pager: cannot allocate bio\n"); bufdone(bp); return; } bp->b_caller1 = bio; bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } /* * Drop reference we were created with. Do directly since we're in a * special context where we don't have to queue the call to * swapgeom_close_ev(). */ cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* * swapgeom_close() may be called from the biodone context, * where we cannot perform topology changes. Delegate the * work to the events thread. */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static int swapongeom_locked(struct cdev *dev, struct vnode *vp) { struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; pp = g_dev_getprovider(dev); if (pp == NULL) return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 1; /* Number of active I/Os, plus one for being active. */ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); return (0); } static int swapongeom(struct vnode *vp) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; } else { g_topology_lock(); error = swapongeom_locked(vp->v_rdev, vp); g_topology_unlock(); } VOP_UNLOCK(vp, 0); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp, 0); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) { int error, new, n; new = nsw_wcount_async_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new > nswbuf / 2 || new < 1) return (EINVAL); mtx_lock(&swbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. If the current async count is too low, * we will need to sqeeze our update slowly in. Sleep with a * higher priority than getpbuf() to finish faster. */ n = new - nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; msleep(&nsw_wcount_async, &swbuf_mtx, PSWP, "swpsysctl", 0); } } mtx_unlock(&swbuf_mtx); return (0); } Index: projects/fuse2/usr.bin/nfsstat/nfsstat.c =================================================================== --- projects/fuse2/usr.bin/nfsstat/nfsstat.c (revision 350434) +++ projects/fuse2/usr.bin/nfsstat/nfsstat.c (revision 350435) @@ -1,1206 +1,1206 @@ /* * Copyright (c) 1983, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 2004, 2008, 2009 Silicon Graphics International Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1983, 1989, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)nfsstat.c 8.2 (Berkeley) 3/31/95"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int widemode = 0; static int zflag = 0; static int printtitle = 1; static struct nfsstatsv1 ext_nfsstats; static int extra_output = 0; static void intpr(int, int); static void printhdr(int, int, int); static void usage(void); static char *sperc1(int, int); static char *sperc2(int, int); static void exp_intpr(int, int, int); static void exp_sidewaysintpr(u_int, int, int, int); static void compute_new_stats(struct nfsstatsv1 *cur_stats, struct nfsstatsv1 *prev_stats, int curop, long double etime, long double *mbsec, long double *kb_per_transfer, long double *transfers_per_second, long double *ms_per_transfer, uint64_t *queue_len, long double *busy_pct); #define DELTA(field) (nfsstats.field - lastst.field) #define STAT_TYPE_READ 0 #define STAT_TYPE_WRITE 1 #define STAT_TYPE_COMMIT 2 #define NUM_STAT_TYPES 3 struct stattypes { int stat_type; int nfs_type; }; static struct stattypes statstruct[] = { {STAT_TYPE_READ, NFSV4OP_READ}, {STAT_TYPE_WRITE, NFSV4OP_WRITE}, {STAT_TYPE_COMMIT, NFSV4OP_COMMIT} }; #define STAT_TYPE_TO_NFS(stat_type) statstruct[stat_type].nfs_type #define NFSSTAT_XO_VERSION "1" int main(int argc, char **argv) { u_int interval; int clientOnly = -1; int serverOnly = -1; int newStats = 0; int ch; char *memf, *nlistf; int mntlen, i; char buf[1024]; struct statfs *mntbuf; struct nfscl_dumpmntopts dumpmntopts; interval = 0; memf = nlistf = NULL; argc = xo_parse_args(argc, argv); if (argc < 0) exit(1); xo_set_version(NFSSTAT_XO_VERSION); while ((ch = getopt(argc, argv, "cdEesWM:mN:w:zq")) != -1) switch(ch) { case 'M': memf = optarg; break; case 'm': /* Display mount options for NFS mount points. */ mntlen = getmntinfo(&mntbuf, MNT_NOWAIT); for (i = 0; i < mntlen; i++) { if (strcmp(mntbuf->f_fstypename, "nfs") == 0) { dumpmntopts.ndmnt_fname = mntbuf->f_mntonname; dumpmntopts.ndmnt_buf = buf; dumpmntopts.ndmnt_blen = sizeof(buf); if (nfssvc(NFSSVC_DUMPMNTOPTS, &dumpmntopts) >= 0) printf("%s on %s\n%s\n", mntbuf->f_mntfromname, mntbuf->f_mntonname, buf); else if (errno == EPERM) errx(1, "Only priviledged users" " can use the -m option"); } mntbuf++; } exit(0); case 'N': nlistf = optarg; break; case 'W': widemode = 1; break; case 'w': interval = atoi(optarg); break; case 'c': clientOnly = 1; if (serverOnly < 0) serverOnly = 0; break; case 'd': newStats = 1; if (interval == 0) interval = 1; break; case 's': serverOnly = 1; if (clientOnly < 0) clientOnly = 0; break; case 'z': zflag = 1; break; case 'E': if (extra_output != 0) xo_err(1, "-e and -E are mutually exclusive"); extra_output = 2; break; case 'e': if (extra_output != 0) xo_err(1, "-e and -E are mutually exclusive"); extra_output = 1; break; case 'q': printtitle = 0; break; case '?': default: usage(); } argc -= optind; argv += optind; #define BACKWARD_COMPATIBILITY #ifdef BACKWARD_COMPATIBILITY if (*argv) { interval = atoi(*argv); if (*++argv) { nlistf = *argv; if (*++argv) memf = *argv; } } #endif if (modfind("nfscommon") < 0) xo_err(1, "NFS client/server not loaded"); if (interval) { exp_sidewaysintpr(interval, clientOnly, serverOnly, newStats); } else { xo_open_container("nfsstat"); if (extra_output != 0) exp_intpr(clientOnly, serverOnly, extra_output - 1); else intpr(clientOnly, serverOnly); xo_close_container("nfsstat"); } xo_finish(); exit(0); } /* * Print a description of the nfs stats. */ static void intpr(int clientOnly, int serverOnly) { int nfssvc_flag; nfssvc_flag = NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT; if (zflag != 0) { if (clientOnly != 0) nfssvc_flag |= NFSSVC_ZEROCLTSTATS; if (serverOnly != 0) nfssvc_flag |= NFSSVC_ZEROSRVSTATS; } ext_nfsstats.vers = NFSSTATS_V1; if (nfssvc(nfssvc_flag, &ext_nfsstats) < 0) xo_err(1, "Can't get stats"); if (clientOnly) { xo_open_container("clientstats"); if (printtitle) xo_emit("{T:Client Info:\n"); xo_open_container("operations"); xo_emit("{T:Rpc Counts:}\n"); xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}" "{T:Lookup/%13.13s}{T:Readlink/%13.13s}" "{T:Read/%13.13s}{T:Write/%13.13s}" "{T:Create/%13.13s}{T:Remove/%13.13s}\n"); xo_emit("{:getattr/%13ju}{:setattr/%13ju}" "{:lookup/%13ju}{:readlink/%13ju}" "{:read/%13ju}{:write/%13ju}" "{:create/%13ju}{:remove/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETATTR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETATTR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOOKUP], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READLINK], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READ], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_WRITE], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATE], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_REMOVE]); xo_emit("{T:Rename/%13.13s}{T:Link/%13.13s}" "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}" "{T:Rmdir/%13.13s}{T:Readdir/%13.13s}" "{T:RdirPlus/%13.13s}{T:Access/%13.13s}\n"); xo_emit("{:rename/%13ju}{:link/%13ju}" "{:symlink/%13ju}{:mkdir/%13ju}" "{:rmdir/%13ju}{:readdir/%13ju}" "{:rdirplus/%13ju}{:access/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RENAME], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LINK], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SYMLINK], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKDIR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RMDIR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIRPLUS], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_ACCESS]); xo_emit("{T:Mknod/%13.13s}{T:Fsstat/%13.13s}" "{T:Fsinfo/%13.13s}{T:PathConf/%13.13s}" "{T:Commit/%13.13s}\n"); xo_emit("{:mknod/%13ju}{:fsstat/%13ju}" "{:fsinfo/%13ju}{:pathconf/%13ju}" "{:commit/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKNOD], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSSTAT], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSINFO], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_PATHCONF], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_COMMIT]); xo_close_container("operations"); xo_open_container("rpcs"); xo_emit("{T:Rpc Info:}\n"); xo_emit("{T:TimedOut/%13.13s}{T:Invalid/%13.13s}" "{T:X Replies/%13.13s}{T:Retries/%13.13s}" "{T:Requests/%13.13s}\n"); xo_emit("{:timedout/%13ju}{:invalid/%13ju}" "{:xreplies/%13ju}{:retries/%13ju}" "{:requests/%13ju}\n", (uintmax_t)ext_nfsstats.rpctimeouts, (uintmax_t)ext_nfsstats.rpcinvalid, (uintmax_t)ext_nfsstats.rpcunexpected, (uintmax_t)ext_nfsstats.rpcretries, (uintmax_t)ext_nfsstats.rpcrequests); xo_close_container("rpcs"); xo_open_container("cache"); xo_emit("{T:Cache Info:}\n"); xo_emit("{T:Attr Hits/%13.13s}{T:Attr Misses/%13.13s}" "{T:Lkup Hits/%13.13s}{T:Lkup Misses/%13.13s}" "{T:BioR Hits/%13.13s}{T:BioR Misses/%13.13s}" "{T:BioW Hits/%13.13s}{T:BioW Misses/%13.13s}\n"); xo_emit("{:attrhits/%13ju}{:attrmisses/%13ju}" "{:lkuphits/%13ju}{:lkupmisses/%13ju}" "{:biorhits/%13ju}{:biormisses/%13ju}" "{:biowhits/%13ju}{:biowmisses/%13ju}\n", (uintmax_t)ext_nfsstats.attrcache_hits, (uintmax_t)ext_nfsstats.attrcache_misses, (uintmax_t)ext_nfsstats.lookupcache_hits, (uintmax_t)ext_nfsstats.lookupcache_misses, (uintmax_t)(ext_nfsstats.biocache_reads - ext_nfsstats.read_bios), (uintmax_t)ext_nfsstats.read_bios, (uintmax_t)(ext_nfsstats.biocache_writes - ext_nfsstats.write_bios), (uintmax_t)ext_nfsstats.write_bios); xo_emit("{T:BioRL Hits/%13.13s}{T:BioRL Misses/%13.13s}" "{T:BioD Hits/%13.13s}{T:BioD Misses/%13.13s}" "{T:DirE Hits/%13.13s}{T:DirE Misses/%13.13s}" "{T:Accs Hits/%13.13s}{T:Accs Misses/%13.13s}\n"); xo_emit("{:biosrlhits/%13ju}{:biorlmisses/%13ju}" "{:biodhits/%13ju}{:biodmisses/%13ju}" "{:direhits/%13ju}{:diremisses/%13ju}" "{:accshits/%13ju}{:accsmisses/%13ju}\n", (uintmax_t)(ext_nfsstats.biocache_readlinks - ext_nfsstats.readlink_bios), (uintmax_t)ext_nfsstats.readlink_bios, (uintmax_t)(ext_nfsstats.biocache_readdirs - ext_nfsstats.readdir_bios), (uintmax_t)ext_nfsstats.readdir_bios, (uintmax_t)ext_nfsstats.direofcache_hits, (uintmax_t)ext_nfsstats.direofcache_misses, (uintmax_t)ext_nfsstats.accesscache_hits, (uintmax_t)ext_nfsstats.accesscache_misses); xo_close_container("cache"); xo_close_container("clientstats"); } if (serverOnly) { xo_open_container("serverstats"); xo_emit("{T:Server Info:}\n"); xo_open_container("operations"); xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}" "{T:Lookup/%13.13s}{T:Readlink/%13.13s}" "{T:Read/%13.13s}{T:Write/%13.13s}" "{T:Create/%13.13s}{T:Remove/%13.13s}\n"); xo_emit("{:getattr/%13ju}{:setattr/%13ju}" "{:lookup/%13ju}{:readlink/%13ju}" "{:read/%13ju}{:write/%13ju}" "{:create/%13ju}{:remove/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETATTR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETATTR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOOKUP], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READLINK], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READ], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CREATE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_REMOVE]); xo_emit("{T:Rename/%13.13s}{T:Link/%13.13s}" "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}" "{T:Rmdir/%13.13s}{T:Readdir/%13.13s}" "{T:RdirPlus/%13.13s}{T:Access/%13.13s}\n"); xo_emit("{:rename/%13ju}{:link/%13ju}" "{:symlink/%13ju}{:mkdir/%13ju}" "{:rmdir/%13ju}{:readdir/%13ju}" "{:rdirplus/%13ju}{:access/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RENAME], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LINK], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SYMLINK], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKDIR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RMDIR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIRPLUS], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_ACCESS]); xo_emit("{T:Mknod/%13.13s}{T:Fsstat/%13.13s}" "{T:Fsinfo/%13.13s}{T:PathConf/%13.13s}" "{T:Commit/%13.13s}\n"); xo_emit("{:mknod/%13ju}{:fsstat/%13ju}" "{:fsinfo/%13ju}{:pathconf/%13ju}" "{:commit/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKNOD], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSSTAT], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSINFO], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PATHCONF], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_COMMIT]); xo_close_container("operations"); xo_open_container("server"); - xo_emit("{T:Server Re-Failed:}\n"); - xo_emit("{T:retfailed/%17ju}\n", (uintmax_t)ext_nfsstats.srvrpc_errs); + xo_emit("{T:Server Re-Failed}\n"); + xo_emit("{:retfailed/%16ju}\n", (uintmax_t)ext_nfsstats.srvrpc_errs); - xo_emit("{T:Server Faults:}\n"); - xo_emit("{T:faults/%13ju}\n", (uintmax_t)ext_nfsstats.srv_errs); + xo_emit("{T:Server Faults}\n"); + xo_emit("{:faults/%13ju}\n", (uintmax_t)ext_nfsstats.srv_errs); xo_emit("{T:Server Write Gathering:/%13.13s}\n"); xo_emit("{T:WriteOps/%13.13s}{T:WriteRPC/%13.13s}" "{T:Opsaved/%13.13s}\n"); xo_emit("{:writeops/%13ju}{:writerpc/%13ju}" "{:opsaved/%13ju}\n", /* * The new client doesn't do write gathering. It was * only useful for NFSv2. */ (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE], 0); xo_close_container("server"); xo_open_container("cache"); xo_emit("{T:Server Cache Stats:/%13.13s}\n"); xo_emit("{T:Inprog/%13.13s}{T:Idem/%13.13s}" "{T:Non-Idem/%13.13s}{T:Misses/%13.13s}\n"); xo_emit("{:inprog/%13ju}{:idem/%13ju}" "{:nonidem/%13ju}{:misses/%13ju}\n", (uintmax_t)ext_nfsstats.srvcache_inproghits, (uintmax_t)ext_nfsstats.srvcache_idemdonehits, (uintmax_t)ext_nfsstats.srvcache_nonidemdonehits, (uintmax_t)ext_nfsstats.srvcache_misses); xo_close_container("cache"); xo_close_container("serverstats"); } } static void printhdr(int clientOnly, int serverOnly, int newStats) { if (newStats) { printf(" [%s Read %s] [%s Write %s] " "%s[=========== Total ============]\n" " KB/t tps MB/s%s KB/t tps MB/s%s " "%sKB/t tps MB/s ms ql %%b", widemode ? "========" : "=====", widemode ? "========" : "=====", widemode ? "========" : "=====", widemode ? "=======" : "====", widemode ? "[Commit ] " : "", widemode ? " ms" : "", widemode ? " ms" : "", widemode ? "tps ms " : ""); } else { printf("%s%6.6s %6.6s %6.6s %6.6s %6.6s %6.6s %6.6s %6.6s", ((serverOnly && clientOnly) ? " " : " "), "GtAttr", "Lookup", "Rdlink", "Read", "Write", "Rename", "Access", "Rddir"); if (widemode && clientOnly) { printf(" Attr Lkup BioR BioW Accs BioD"); } } printf("\n"); fflush(stdout); } static void usage(void) { (void)fprintf(stderr, "usage: nfsstat [-cdemszW] [-M core] [-N system] [-w wait]\n"); exit(1); } static char SPBuf[64][8]; static int SPIndex; static char * sperc1(int hits, int misses) { char *p = SPBuf[SPIndex]; if (hits + misses) { sprintf(p, "%3d%%", (int)(char)((quad_t)hits * 100 / (hits + misses))); } else { sprintf(p, " -"); } SPIndex = (SPIndex + 1) & 63; return(p); } static char * sperc2(int ttl, int misses) { char *p = SPBuf[SPIndex]; if (ttl) { sprintf(p, "%3d%%", (int)(char)((quad_t)(ttl - misses) * 100 / ttl)); } else { sprintf(p, " -"); } SPIndex = (SPIndex + 1) & 63; return(p); } #define DELTA_T(field) \ devstat_compute_etime(&cur_stats->field, \ (prev_stats ? &prev_stats->field : NULL)) /* * XXX KDM mostly copied from ctlstat. We should commonize the code (and * the devstat code) somehow. */ static void compute_new_stats(struct nfsstatsv1 *cur_stats, struct nfsstatsv1 *prev_stats, int curop, long double etime, long double *mbsec, long double *kb_per_transfer, long double *transfers_per_second, long double *ms_per_transfer, uint64_t *queue_len, long double *busy_pct) { uint64_t total_bytes = 0, total_operations = 0; struct bintime total_time_bt; struct timespec total_time_ts; bzero(&total_time_bt, sizeof(total_time_bt)); bzero(&total_time_ts, sizeof(total_time_ts)); total_bytes = cur_stats->srvbytes[curop]; total_operations = cur_stats->srvops[curop]; if (prev_stats != NULL) { total_bytes -= prev_stats->srvbytes[curop]; total_operations -= prev_stats->srvops[curop]; } *mbsec = total_bytes; *mbsec /= 1024 * 1024; if (etime > 0.0) { *busy_pct = DELTA_T(busytime); if (*busy_pct < 0) *busy_pct = 0; *busy_pct /= etime; *busy_pct *= 100; if (*busy_pct < 0) *busy_pct = 0; *mbsec /= etime; } else { *busy_pct = 0; *mbsec = 0; } *kb_per_transfer = total_bytes; *kb_per_transfer /= 1024; if (total_operations > 0) *kb_per_transfer /= total_operations; else *kb_per_transfer = 0; if (etime > 0.0) { *transfers_per_second = total_operations; *transfers_per_second /= etime; } else { *transfers_per_second = 0.0; } if (total_operations > 0) { *ms_per_transfer = DELTA_T(srvduration[curop]); *ms_per_transfer /= total_operations; *ms_per_transfer *= 1000; } else *ms_per_transfer = 0.0; *queue_len = cur_stats->srvstartcnt - cur_stats->srvdonecnt; } /* * Print a description of the nfs stats for the client/server, * including NFSv4.1. */ static void exp_intpr(int clientOnly, int serverOnly, int nfs41) { int nfssvc_flag; xo_open_container("nfsv4"); nfssvc_flag = NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT; if (zflag != 0) { if (clientOnly != 0) nfssvc_flag |= NFSSVC_ZEROCLTSTATS; if (serverOnly != 0) nfssvc_flag |= NFSSVC_ZEROSRVSTATS; } ext_nfsstats.vers = NFSSTATS_V1; if (nfssvc(nfssvc_flag, &ext_nfsstats) < 0) xo_err(1, "Can't get stats"); if (clientOnly != 0) { xo_open_container("clientstats"); xo_open_container("operations"); if (printtitle) { xo_emit("{T:Client Info:}\n"); xo_emit("{T:RPC Counts:}\n"); } xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}" "{T:Lookup/%13.13s}{T:Readlink/%13.13s}" "{T:Read/%13.13s}{T:Write/%13.13s}\n"); xo_emit("{:getattr/%13ju}{:setattr/%13ju}{:lookup/%13ju}" "{:readlink/%13ju}{:read/%13ju}{:write/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETATTR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETATTR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOOKUP], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READLINK], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READ], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_WRITE]); xo_emit("{T:Create/%13.13s}{T:Remove/%13.13s}" "{T:Rename/%13.13s}{T:Link/%13.13s}" "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}\n"); xo_emit("{:create/%13ju}{:remove/%13ju}{:rename/%13ju}" "{:link/%13ju}{:symlink/%13ju}{:mkdir/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATE], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_REMOVE], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RENAME], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LINK], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SYMLINK], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKDIR]); xo_emit("{T:Rmdir/%13.13s}{T:Readdir/%13.13s}" "{T:RdirPlus/%13.13s}{T:Access/%13.13s}" "{T:Mknod/%13.13s}{T:Fsstat/%13.13s}\n"); xo_emit("{:rmdir/%13ju}{:readdir/%13ju}{:rdirplus/%13ju}" "{:access/%13ju}{:mknod/%13ju}{:fsstat/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RMDIR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIR], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIRPLUS], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_ACCESS], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKNOD], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSSTAT]); xo_emit("{T:FSinfo/%13.13s}{T:pathConf/%13.13s}" "{T:Commit/%13.13s}{T:SetClId/%13.13s}" "{T:SetClIdCf/%13.13s}{T:Lock/%13.13s}\n"); xo_emit("{:fsinfo/%13ju}{:pathconf/%13ju}{:commit/%13ju}" "{:setclientid/%13ju}{:setclientidcf/%13ju}{:lock/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSINFO], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_PATHCONF], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_COMMIT], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETCLIENTID], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETCLIENTIDCFRM], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOCK]); xo_emit("{T:LockT/%13.13s}{T:LockU/%13.13s}" "{T:Open/%13.13s}{T:OpenCfr/%13.13s}\n"); xo_emit("{:lockt/%13ju}{:locku/%13ju}" "{:open/%13ju}{:opencfr/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOCKT], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOCKU], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPEN], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPENCONFIRM]); if (nfs41) { xo_open_container("nfsv41"); xo_emit("{T:OpenDownGr/%13.13s}{T:Close/%13.13s}\n"); xo_emit("{:opendowngr/%13ju}{:close/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPENDOWNGRADE], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CLOSE]); xo_emit("{T:RelLckOwn/%13.13s}{T:FreeStateID/%13.13s}" "{T:PutRootFH/%13.13s}{T:DelegRet/%13.13s}" "{T:GetAcl/%13.13s}{T:SetAcl/%13.13s}\n"); xo_emit("{:rellckown/%13ju}{:freestateid/%13ju}" "{:getacl/%13ju}{:delegret/%13ju}" "{:getacl/%13ju}{:setacl/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RELEASELCKOWN], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FREESTATEID], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_PUTROOTFH], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_DELEGRETURN], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETACL], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETACL]); xo_emit("{T:ExchangeId/%13.13s}{T:CreateSess/%13.13s}" "{T:DestroySess/%13.13s}{T:DestroyClId/%13.13s}" "{T:LayoutGet/%13.13s}{T:GetDevInfo/%13.13s}\n"); xo_emit("{:exchangeid/%13ju}{:createsess/%13ju}" "{:destroysess/%13ju}{:destroyclid/%13ju}" "{:layoutget/%13ju}{:getdevinfo/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_EXCHANGEID], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATESESSION], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_DESTROYSESSION], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_DESTROYCLIENT], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LAYOUTGET], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETDEVICEINFO]); xo_emit("{T:LayoutCommit/%13.13s}{T:LayoutReturn/%13.13s}" "{T:ReclaimCompl/%13.13s}{T:ReadDataS/%13.13s}" "{T:WriteDataS/%13.13s}{T:CommitDataS/%13.13s}\n"); xo_emit("{:layoutcomit/%13ju}{:layoutreturn/%13ju}" "{:reclaimcompl/%13ju}{:readdatas/%13ju}" "{:writedatas/%13ju}{:commitdatas/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LAYOUTCOMMIT], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LAYOUTRETURN], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RECLAIMCOMPL], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDS], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_WRITEDS], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_COMMITDS]); xo_emit("{T:OpenLayout/%13.13s}{T:CreateLayout/%13.13s}\n"); xo_emit("{:openlayout/%13ju}{:createlayout/%13ju}\n", (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPENLAYGET], (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATELAYGET]); xo_close_container("nfsv41"); } xo_close_container("operations"); xo_open_container("client"); xo_emit("{T:OpenOwner/%13.13s}{T:Opens/%13.13s}" "{T:LockOwner/%13.13s}{T:Locks/%13.13s}" "{T:Delegs/%13.13s}{T:LocalOwn/%13.13s}\n"); xo_emit("{:openowner/%13ju}{:opens/%13ju}" "{:lockowner/%13ju}{:locks/%13ju}" "{:delegs/%13ju}{:localown/%13ju}\n", (uintmax_t)ext_nfsstats.clopenowners, (uintmax_t)ext_nfsstats.clopens, (uintmax_t)ext_nfsstats.cllockowners, (uintmax_t)ext_nfsstats.cllocks, (uintmax_t)ext_nfsstats.cldelegates, (uintmax_t)ext_nfsstats.cllocalopenowners); xo_emit("{T:LocalOpen/%13.13s}{T:LocalLown/%13.13s}" "{T:LocalLock/%13.13s}\n"); xo_emit("{:localopen/%13ju}{:locallown/%13ju}" "{:locallock/%13ju}\n", (uintmax_t)ext_nfsstats.cllocalopens, (uintmax_t)ext_nfsstats.cllocallockowners, (uintmax_t)ext_nfsstats.cllocallocks); xo_close_container("client"); xo_open_container("rpc"); if (printtitle) xo_emit("{T:Rpc Info:}\n"); xo_emit("{T:TimedOut/%13.13s}{T:Invalid/%13.13s}" "{T:X Replies/%13.13s}{T:Retries/%13.13s}" "{T:Requests/%13.13s}\n"); xo_emit("{:timedout/%13ju}{:invalid/%13ju}" "{:xreplies/%13ju}{:retries/%13ju}" "{:requests/%13ju}\n", (uintmax_t)ext_nfsstats.rpctimeouts, (uintmax_t)ext_nfsstats.rpcinvalid, (uintmax_t)ext_nfsstats.rpcunexpected, (uintmax_t)ext_nfsstats.rpcretries, (uintmax_t)ext_nfsstats.rpcrequests); xo_close_container("rpc"); xo_open_container("cache"); if (printtitle) xo_emit("{T:Cache Info:}\n"); xo_emit("{T:Attr Hits/%13.13s}{T:Attr Misses/%13.13s}" "{T:Lkup Hits/%13.13s}{T:Lkup Misses/%13.13s}\n"); xo_emit("{:attrhits/%13ju}{:attrmisses/%13ju}" "{:lkuphits/%13ju}{:lkupmisses/%13ju}\n", (uintmax_t)ext_nfsstats.attrcache_hits, (uintmax_t)ext_nfsstats.attrcache_misses, (uintmax_t)ext_nfsstats.lookupcache_hits, (uintmax_t)ext_nfsstats.lookupcache_misses); xo_emit("{T:BioR Hits/%13.13s}{T:BioR Misses/%13.13s}" "{T:BioW Hits/%13.13s}{T:BioW Misses/%13.13s}\n"); xo_emit("{:biorhits/%13ju}{:biormisses/%13ju}" "{:biowhits/%13ju}{:biowmisses/%13ju}\n", (uintmax_t)(ext_nfsstats.biocache_reads - ext_nfsstats.read_bios), (uintmax_t)ext_nfsstats.read_bios, (uintmax_t)(ext_nfsstats.biocache_writes - ext_nfsstats.write_bios), (uintmax_t)ext_nfsstats.write_bios); xo_emit("{T:BioRL Hits/%13.13s}{T:BioRL Misses/%13.13s}" "{T:BioD Hits/%13.13s}{T:BioD Misses/%13.13s}\n"); xo_emit("{:biorlhits/%13ju}{:biorlmisses/%13ju}" "{:biodhits/%13ju}{:biodmisses/%13ju}\n", (uintmax_t)(ext_nfsstats.biocache_readlinks - ext_nfsstats.readlink_bios), (uintmax_t)ext_nfsstats.readlink_bios, (uintmax_t)(ext_nfsstats.biocache_readdirs - ext_nfsstats.readdir_bios), (uintmax_t)ext_nfsstats.readdir_bios); xo_emit("{T:DirE Hits/%13.13s}{T:DirE Misses/%13.13s}\n"); xo_emit("{:direhits/%13ju}{:diremisses/%13ju}\n", (uintmax_t)ext_nfsstats.direofcache_hits, (uintmax_t)ext_nfsstats.direofcache_misses); xo_open_container("cache"); xo_close_container("clientstats"); } if (serverOnly != 0) { xo_open_container("serverstats"); xo_open_container("operations"); if (printtitle) xo_emit("{T:Server Info:}\n"); xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}" "{T:Lookup/%13.13s}{T:Readlink/%13.13s}" "{T:Read/%13.13s}{T:Write/%13.13s}\n"); xo_emit("{:getattr/%13ju}{:setattr/%13ju}{:lookup/%13ju}" "{:readlink/%13ju}{:read/%13ju}{:write/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETATTR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETATTR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOOKUP], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READLINK], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READ], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE]); xo_emit("{T:Create/%13.13s}{T:Remove/%13.13s}" "{T:Rename/%13.13s}{T:Link/%13.13s}" "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}\n"); xo_emit("{:create/%13ju}{:remove/%13ju}{:rename/%13ju}" "{:link/%13ju}{:symlink/%13ju}{:mkdir/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_V3CREATE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_REMOVE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RENAME], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LINK], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SYMLINK], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKDIR]); xo_emit("{T:Rmdir/%13.13s}{T:Readdir/%13.13s}" "{T:RdirPlus/%13.13s}{T:Access/%13.13s}" "{T:Mknod/%13.13s}{T:Fsstat/%13.13s}\n"); xo_emit("{:rmdir/%13ju}{:readdir/%13ju}{:rdirplus/%13ju}" "{:access/%13ju}{:mknod/%13ju}{:fsstat/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RMDIR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIRPLUS], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_ACCESS], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKNOD], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSSTAT]); xo_emit("{T:FSinfo/%13.13s}{T:pathConf/%13.13s}" "{T:Commit/%13.13s}{T:LookupP/%13.13s}" "{T:SetClId/%13.13s}{T:SetClIdCf/%13.13s}\n"); xo_emit("{:fsinfo/%13ju}{:pathconf/%13ju}{:commit/%13ju}" "{:lookupp/%13ju}{:setclientid/%13ju}{:setclientidcfrm/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSINFO], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PATHCONF], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_COMMIT], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOOKUPP], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETCLIENTID], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETCLIENTIDCFRM]); xo_emit("{T:Open/%13.13s}{T:OpenAttr/%13.13s}" "{T:OpenDwnGr/%13.13s}{T:OpenCfrm/%13.13s}" "{T:DelePurge/%13.13s}{T:DelRet/%13.13s}\n"); xo_emit("{:open/%13ju}{:openattr/%13ju}{:opendwgr/%13ju}" "{:opencfrm/%13ju}{:delepurge/%13ju}{:delreg/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPEN], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPENATTR], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPENDOWNGRADE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPENCONFIRM], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DELEGPURGE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DELEGRETURN]); xo_emit("{T:GetFH/%13.13s}{T:Lock/%13.13s}" "{T:LockT/%13.13s}{T:LockU/%13.13s}" "{T:Close/%13.13s}{T:Verify/%13.13s}\n"); xo_emit("{:getfh/%13ju}{:lock/%13ju}{:lockt/%13ju}" "{:locku/%13ju}{:close/%13ju}{:verify/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETFH], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOCK], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOCKT], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOCKU], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CLOSE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_VERIFY]); xo_emit("{T:NVerify/%13.13s}{T:PutFH/%13.13s}" "{T:PutPubFH/%13.13s}{T:PutRootFH/%13.13s}" "{T:Renew/%13.13s}{T:RestoreFH/%13.13s}\n"); xo_emit("{:nverify/%13ju}{:putfh/%13ju}{:putpubfh/%13ju}" "{:putrootfh/%13ju}{:renew/%13ju}{:restore/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_NVERIFY], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PUTFH], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PUTPUBFH], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PUTROOTFH], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RENEW], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RESTOREFH]); xo_emit("{T:SaveFH/%13.13s}{T:Secinfo/%13.13s}" "{T:RelLockOwn/%13.13s}{T:V4Create/%13.13s}\n"); xo_emit("{:savefh/%13ju}{:secinfo/%13ju}{:rellockown/%13ju}" "{:v4create/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SAVEFH], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SECINFO], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RELEASELCKOWN], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CREATE]); if (nfs41) { xo_open_container("nfsv41"); xo_emit("{T:BackChannelCtrl/%13.13s}{T:BindConnToSess/%13.13s}" "{T:ExchangeID/%13.13s}{T:CreateSess/%13.13s}" "{T:DestroySess/%13.13s}{T:FreeStateID/%13.13s}\n"); xo_emit("{:backchannelctrl/%13ju}{:bindconntosess/%13ju}" "{:exchangeid/%13ju}{:createsess/%13ju}" "{:destroysess/%13ju}{:freestateid/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_BACKCHANNELCTL], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_BINDCONNTOSESS], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_EXCHANGEID], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CREATESESSION], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DESTROYSESSION], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FREESTATEID]), xo_emit("{T:GetDirDeleg/%13.13s}{T:GetDevInfo/%13.13s}" "{T:GetDevList/%13.13s}{T:layoutCommit/%13.13s}" "{T:LayoutGet/%13.13s}{T:LayoutReturn/%13.13s}\n"); xo_emit("{:getdirdeleg/%13ju}{:getdevinfo/%13ju}" "{:getdevlist/%13ju}{:layoutcommit/%13ju}" "{:layoutget/%13ju}{:layoutreturn/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETDIRDELEG], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETDEVINFO], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETDEVLIST], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LAYOUTCOMMIT], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LAYOUTGET], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LAYOUTRETURN]); xo_emit("{T:SecInfNoName/%13.13s}{T:Sequence/%13.13s}" "{T:SetSSV/%13.13s}{T:TestStateID/%13.13s}" "{T:WantDeleg/%13.13s}{T:DestroyClId/%13.13s}\n"); xo_emit("{:secinfnoname/%13ju}{:sequence/%13ju}" "{:setssv/%13ju}{:teststateid/%13ju}{:wantdeleg/%13ju}" "{:destroyclid/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SECINFONONAME], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SEQUENCE], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETSSV], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_TESTSTATEID], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WANTDELEG], (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DESTROYCLIENTID]); xo_emit("{T:ReclaimCompl/%13.13s}\n"); xo_emit("{:reclaimcompl/%13ju}\n", (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RECLAIMCOMPL]); xo_close_container("nfsv41"); } xo_close_container("operations"); if (printtitle) xo_emit("{T:Server:}\n"); xo_open_container("server"); xo_emit("{T:Retfailed/%13.13s}{T:Faults/%13.13s}" "{T:Clients/%13.13s}\n"); xo_emit("{:retfailed/%13ju}{:faults/%13ju}{:clients/%13ju}\n", (uintmax_t)ext_nfsstats.srv_errs, (uintmax_t)ext_nfsstats.srvrpc_errs, (uintmax_t)ext_nfsstats.srvclients); xo_emit("{T:OpenOwner/%13.13s}{T:Opens/%13.13s}" "{T:LockOwner/%13.13s}{T:Locks/%13.13s}" "{T:Delegs/%13.13s}\n"); xo_emit("{:openowner/%13ju}{:opens/%13ju}{:lockowner/%13ju}" "{:locks/%13ju}{:delegs/%13ju}\n", (uintmax_t)ext_nfsstats.srvopenowners, (uintmax_t)ext_nfsstats.srvopens, (uintmax_t)ext_nfsstats.srvlockowners, (uintmax_t)ext_nfsstats.srvlocks, (uintmax_t)ext_nfsstats.srvdelegates); xo_close_container("server"); if (printtitle) xo_emit("{T:Server Cache Stats:}\n"); xo_open_container("cache"); xo_emit("{T:Inprog/%13.13s}{T:Idem/%13.13s}" "{T:Non-idem/%13.13s}{T:Misses/%13.13s}" "{T:CacheSize/%13.13s}{T:TCPPeak/%13.13s}\n"); xo_emit("{:inprog/%13ju}{:idem/%13ju}{:nonidem/%13ju}" "{:misses/%13ju}{:cachesize/%13ju}{:tcppeak/%13ju}\n", (uintmax_t)ext_nfsstats.srvcache_inproghits, (uintmax_t)ext_nfsstats.srvcache_idemdonehits, (uintmax_t)ext_nfsstats.srvcache_nonidemdonehits, (uintmax_t)ext_nfsstats.srvcache_misses, (uintmax_t)ext_nfsstats.srvcache_size, (uintmax_t)ext_nfsstats.srvcache_tcppeak); xo_close_container("cache"); xo_close_container("serverstats"); } xo_close_container("nfsv4"); } static void compute_totals(struct nfsstatsv1 *total_stats, struct nfsstatsv1 *cur_stats) { int i; bzero(total_stats, sizeof(*total_stats)); for (i = 0; i < (NFSV42_NOPS + NFSV4OP_FAKENOPS); i++) { total_stats->srvbytes[0] += cur_stats->srvbytes[i]; total_stats->srvops[0] += cur_stats->srvops[i]; bintime_add(&total_stats->srvduration[0], &cur_stats->srvduration[i]); total_stats->srvrpccnt[i] = cur_stats->srvrpccnt[i]; } total_stats->srvstartcnt = cur_stats->srvstartcnt; total_stats->srvdonecnt = cur_stats->srvdonecnt; total_stats->busytime = cur_stats->busytime; } /* * Print a running summary of nfs statistics for the experimental client and/or * server. * Repeat display every interval seconds, showing statistics * collected over that interval. Assumes that interval is non-zero. * First line printed at top of screen is always cumulative. */ static void exp_sidewaysintpr(u_int interval, int clientOnly, int serverOnly, int newStats) { struct nfsstatsv1 nfsstats, lastst, *ext_nfsstatsp; struct nfsstatsv1 curtotal, lasttotal; struct timespec ts, lastts; int hdrcnt = 1; ext_nfsstatsp = &lastst; ext_nfsstatsp->vers = NFSSTATS_V1; if (nfssvc(NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT, ext_nfsstatsp) < 0) err(1, "Can't get stats"); clock_gettime(CLOCK_MONOTONIC, &lastts); compute_totals(&lasttotal, ext_nfsstatsp); sleep(interval); for (;;) { ext_nfsstatsp = &nfsstats; ext_nfsstatsp->vers = NFSSTATS_V1; if (nfssvc(NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT, ext_nfsstatsp) < 0) err(1, "Can't get stats"); clock_gettime(CLOCK_MONOTONIC, &ts); if (--hdrcnt == 0) { printhdr(clientOnly, serverOnly, newStats); if (newStats) hdrcnt = 20; else if (clientOnly && serverOnly) hdrcnt = 10; else hdrcnt = 20; } if (clientOnly && newStats == 0) { printf("%s %6ju %6ju %6ju %6ju %6ju %6ju %6ju %6ju", ((clientOnly && serverOnly) ? "Client:" : ""), (uintmax_t)DELTA(rpccnt[NFSPROC_GETATTR]), (uintmax_t)DELTA(rpccnt[NFSPROC_LOOKUP]), (uintmax_t)DELTA(rpccnt[NFSPROC_READLINK]), (uintmax_t)DELTA(rpccnt[NFSPROC_READ]), (uintmax_t)DELTA(rpccnt[NFSPROC_WRITE]), (uintmax_t)DELTA(rpccnt[NFSPROC_RENAME]), (uintmax_t)DELTA(rpccnt[NFSPROC_ACCESS]), (uintmax_t)(DELTA(rpccnt[NFSPROC_READDIR]) + DELTA(rpccnt[NFSPROC_READDIRPLUS])) ); if (widemode) { printf(" %s %s %s %s %s %s", sperc1(DELTA(attrcache_hits), DELTA(attrcache_misses)), sperc1(DELTA(lookupcache_hits), DELTA(lookupcache_misses)), sperc2(DELTA(biocache_reads), DELTA(read_bios)), sperc2(DELTA(biocache_writes), DELTA(write_bios)), sperc1(DELTA(accesscache_hits), DELTA(accesscache_misses)), sperc2(DELTA(biocache_readdirs), DELTA(readdir_bios)) ); } printf("\n"); } if (serverOnly && newStats) { long double cur_secs, last_secs, etime; long double mbsec; long double kb_per_transfer; long double transfers_per_second; long double ms_per_transfer; uint64_t queue_len; long double busy_pct; int i; cur_secs = ts.tv_sec + ((long double)ts.tv_nsec / 1000000000); last_secs = lastts.tv_sec + ((long double)lastts.tv_nsec / 1000000000); etime = cur_secs - last_secs; compute_totals(&curtotal, &nfsstats); for (i = 0; i < NUM_STAT_TYPES; i++) { compute_new_stats(&nfsstats, &lastst, STAT_TYPE_TO_NFS(i), etime, &mbsec, &kb_per_transfer, &transfers_per_second, &ms_per_transfer, &queue_len, &busy_pct); if (i == STAT_TYPE_COMMIT) { if (widemode == 0) continue; printf("%2.0Lf %7.2Lf ", transfers_per_second, ms_per_transfer); } else { printf("%5.2Lf %5.0Lf %7.2Lf ", kb_per_transfer, transfers_per_second, mbsec); if (widemode) printf("%5.2Lf ", ms_per_transfer); } } compute_new_stats(&curtotal, &lasttotal, 0, etime, &mbsec, &kb_per_transfer, &transfers_per_second, &ms_per_transfer, &queue_len, &busy_pct); printf("%5.2Lf %5.0Lf %7.2Lf %5.2Lf %3ju %3.0Lf\n", kb_per_transfer, transfers_per_second, mbsec, ms_per_transfer, queue_len, busy_pct); } else if (serverOnly) { printf("%s %6ju %6ju %6ju %6ju %6ju %6ju %6ju %6ju", ((clientOnly && serverOnly) ? "Server:" : ""), (uintmax_t)DELTA(srvrpccnt[NFSV4OP_GETATTR]), (uintmax_t)DELTA(srvrpccnt[NFSV4OP_LOOKUP]), (uintmax_t)DELTA(srvrpccnt[NFSV4OP_READLINK]), (uintmax_t)DELTA(srvrpccnt[NFSV4OP_READ]), (uintmax_t)DELTA(srvrpccnt[NFSV4OP_WRITE]), (uintmax_t)DELTA(srvrpccnt[NFSV4OP_RENAME]), (uintmax_t)DELTA(srvrpccnt[NFSV4OP_ACCESS]), (uintmax_t)(DELTA(srvrpccnt[NFSV4OP_READDIR]) + DELTA(srvrpccnt[NFSV4OP_READDIRPLUS]))); printf("\n"); } bcopy(&nfsstats, &lastst, sizeof(lastst)); bcopy(&curtotal, &lasttotal, sizeof(lasttotal)); lastts = ts; fflush(stdout); sleep(interval); } /*NOTREACHED*/ } Index: projects/fuse2/usr.bin/printf/printf.1 =================================================================== --- projects/fuse2/usr.bin/printf/printf.1 (revision 350434) +++ projects/fuse2/usr.bin/printf/printf.1 (revision 350435) @@ -1,382 +1,385 @@ .\" Copyright (c) 1989, 1990, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" This code is derived from software contributed to Berkeley by .\" the Institute of Electrical and Electronics Engineers, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)printf.1 8.1 (Berkeley) 6/6/93 .\" $FreeBSD$ .\" -.Dd April 21, 2014 +.Dd July 29, 2019 .Dt PRINTF 1 .Os .Sh NAME .Nm printf .Nd formatted output .Sh SYNOPSIS .Nm .Ar format Op Ar arguments ... .Sh DESCRIPTION The .Nm utility formats and prints its arguments, after the first, under control of the .Ar format . The .Ar format is a character string which contains three types of objects: plain characters, which are simply copied to standard output, character escape sequences which are converted and copied to the standard output, and format specifications, each of which causes printing of the next successive .Ar argument . .Pp The .Ar arguments after the first are treated as strings if the corresponding format is either .Cm c , b or .Cm s ; otherwise it is evaluated as a C constant, with the following extensions: .Pp .Bl -bullet -offset indent -compact .It A leading plus or minus sign is allowed. .It If the leading character is a single or double quote, the value is the character code of the next character. .El .Pp The format string is reused as often as necessary to satisfy the .Ar arguments . Any extra format specifications are evaluated with zero or the null string. .Pp Character escape sequences are in backslash notation as defined in the .St -ansiC , with extensions. The characters and their meanings are as follows: .Pp .Bl -tag -width Ds -offset indent -compact .It Cm \ea Write a character. .It Cm \eb Write a character. -.It Cm \ec -Ignore remaining characters in this string. .It Cm \ef Write a character. .It Cm \en Write a character. .It Cm \er Write a character. .It Cm \et Write a character. .It Cm \ev Write a character. .It Cm \e\' Write a character. .It Cm \e\e Write a backslash character. .It Cm \e Ns Ar num Write a byte whose value is the 1-, 2-, or 3-digit octal number .Ar num . Multibyte characters can be constructed using multiple .Cm \e Ns Ar num sequences. .El .Pp Each format specification is introduced by the percent character (``%''). The remainder of the format specification includes, in the following order: .Bl -tag -width Ds .It "Zero or more of the following flags:" .Bl -tag -width Ds .It Cm # A `#' character specifying that the value should be printed in an ``alternate form''. For .Cm b , c , d , s and .Cm u formats, this option has no effect. For the .Cm o formats the precision of the number is increased to force the first character of the output string to a zero. For the .Cm x .Pq Cm X format, a non-zero result has the string .Li 0x .Pq Li 0X prepended to it. For .Cm a , A , e , E , f , F , g and .Cm G formats, the result will always contain a decimal point, even if no digits follow the point (normally, a decimal point only appears in the results of those formats if a digit follows the decimal point). For .Cm g and .Cm G formats, trailing zeros are not removed from the result as they would otherwise be; .It Cm \&\- A minus sign `\-' which specifies .Em left adjustment of the output in the indicated field; .It Cm \&+ A `+' character specifying that there should always be a sign placed before the number when using signed formats. .It Sq \&\ \& A space specifying that a blank should be left before a positive number for a signed format. A `+' overrides a space if both are used; .It Cm \&0 A zero `0' character indicating that zero-padding should be used rather than blank-padding. A `\-' overrides a `0' if both are used; .El .It "Field Width:" An optional digit string specifying a .Em field width ; if the output string has fewer bytes than the field width it will be blank-padded on the left (or right, if the left-adjustment indicator has been given) to make up the field width (note that a leading zero is a flag, but an embedded zero is part of a field width); .It Precision: An optional period, .Sq Cm \&.\& , followed by an optional digit string giving a .Em precision which specifies the number of digits to appear after the decimal point, for .Cm e and .Cm f formats, or the maximum number of bytes to be printed from a string; if the digit string is missing, the precision is treated as zero; .It Format: A character which indicates the type of format to use (one of .Cm diouxXfFeEgGaAcsb ) . The uppercase formats differ from their lowercase counterparts only in that the output of the former is entirely in uppercase. The floating-point format specifiers .Pq Cm fFeEgGaA may be prefixed by an .Cm L to request that additional precision be used, if available. .El .Pp A field width or precision may be .Sq Cm \&* instead of a digit string. In this case an .Ar argument supplies the field width or precision. .Pp The format characters and their meanings are: .Bl -tag -width Fl .It Cm diouXx The .Ar argument is printed as a signed decimal (d or i), unsigned octal, unsigned decimal, or unsigned hexadecimal (X or x), respectively. .It Cm fF The .Ar argument is printed in the style `[\-]ddd.ddd' where the number of d's after the decimal point is equal to the precision specification for the argument. If the precision is missing, 6 digits are given; if the precision is explicitly 0, no digits and no decimal point are printed. The values \*[If] and \*[Na] are printed as .Ql inf and .Ql nan , respectively. .It Cm eE The .Ar argument is printed in the style .Cm e .Sm off .Sq Op - Ar d.ddd No \(+- Ar dd .Sm on where there is one digit before the decimal point and the number after is equal to the precision specification for the argument; when the precision is missing, 6 digits are produced. The values \*[If] and \*[Na] are printed as .Ql inf and .Ql nan , respectively. .It Cm gG The .Ar argument is printed in style .Cm f .Pq Cm F or in style .Cm e .Pq Cm E whichever gives full precision in minimum space. .It Cm aA The .Ar argument is printed in style .Sm off .Sq Op - Ar h.hhh No \(+- Li p Ar d .Sm on where there is one digit before the hexadecimal point and the number after is equal to the precision specification for the argument; when the precision is missing, enough digits are produced to convey the argument's exact double-precision floating-point representation. The values \*[If] and \*[Na] are printed as .Ql inf and .Ql nan , respectively. .It Cm c The first byte of .Ar argument is printed. .It Cm s Bytes from the string .Ar argument are printed until the end is reached or until the number of bytes indicated by the precision specification is reached; however if the precision is 0 or missing, the string is printed entirely. .It Cm b As for .Cm s , but interpret character escapes in backslash notation in the string .Ar argument . The permitted escape sequences are slightly different in that octal escapes are .Cm \e0 Ns Ar num instead of -.Cm \e Ns Ar num . +.Cm \e Ns Ar num +and that an additional escape sequence +.Cm \ec +stops further output from this +.Nm +invocation. .It Cm n$ Allows reordering of the output according to .Ar argument . .It Cm \&% Print a `%'; no argument is used. .El .Pp The decimal point character is defined in the program's locale (category .Dv LC_NUMERIC ) . .Pp In no case does a non-existent or small field width cause truncation of a field; padding takes place only if the specified field width exceeds the actual width. .Pp Some shells may provide a builtin .Nm command which is similar or identical to this utility. Consult the .Xr builtin 1 manual page. .Sh EXIT STATUS .Ex -std .Sh COMPATIBILITY The traditional .Bx behavior of converting arguments of numeric formats not beginning with a digit to the .Tn ASCII code of the first character is not supported. .Sh SEE ALSO .Xr builtin 1 , .Xr echo 1 , .Xr sh 1 , .Xr printf 3 .Sh STANDARDS The .Nm command is expected to be compatible with the .St -p1003.2 specification. .Sh HISTORY The .Nm command appeared in .Bx 4.3 Reno . It is modeled after the standard library function, .Xr printf 3 . .Sh CAVEATS .Tn ANSI hexadecimal character constants were deliberately not provided. .Pp Trying to print a dash ("-") as the first character causes .Nm to interpret the dash as a program argument. .Nm -- must be used before .Ar format . .Pp If the locale contains multibyte characters (such as UTF-8), the .Cm c format and .Cm b and .Cm s formats with a precision may not operate as expected. .Sh BUGS Since the floating point numbers are translated from .Tn ASCII to floating-point and then back again, floating-point precision may be lost. (By default, the number is translated to an IEEE-754 double-precision value before being printed. The .Cm L modifier may produce additional precision, depending on the hardware platform.) .Pp The escape sequence \e000 is the string terminator. When present in the argument for the .Cm b format, the argument will be truncated at the \e000 character. .Pp Multibyte characters are not recognized in format strings (this is only a problem if .Ql % can appear inside a multibyte character). Index: projects/fuse2 =================================================================== --- projects/fuse2 (revision 350434) +++ projects/fuse2 (revision 350435) Property changes on: projects/fuse2 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r350391-350426