Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/include/libunwind.h =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/include/libunwind.h (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/include/libunwind.h (revision 308054) @@ -1,607 +1,607 @@ //===---------------------------- libunwind.h -----------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // -// Compatible with libuwind API documented at: +// Compatible with libunwind API documented at: // http://www.nongnu.org/libunwind/man/libunwind(3).html // //===----------------------------------------------------------------------===// #ifndef __LIBUNWIND__ #define __LIBUNWIND__ #include <__libunwind_config.h> #include #include #ifdef __APPLE__ #include #ifdef __arm__ #define LIBUNWIND_AVAIL __attribute__((unavailable)) #else #define LIBUNWIND_AVAIL __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_5_0) #endif #else #define LIBUNWIND_AVAIL #endif /* error codes */ enum { UNW_ESUCCESS = 0, /* no error */ UNW_EUNSPEC = -6540, /* unspecified (general) error */ UNW_ENOMEM = -6541, /* out of memory */ UNW_EBADREG = -6542, /* bad register number */ UNW_EREADONLYREG = -6543, /* attempt to write read-only register */ UNW_ESTOPUNWIND = -6544, /* stop unwinding */ UNW_EINVALIDIP = -6545, /* invalid IP */ UNW_EBADFRAME = -6546, /* bad frame */ UNW_EINVAL = -6547, /* unsupported operation or bad value */ UNW_EBADVERSION = -6548, /* unwind info has unsupported version */ UNW_ENOINFO = -6549 /* no unwind info found */ }; struct unw_context_t { uint64_t data[_LIBUNWIND_CONTEXT_SIZE]; }; typedef struct unw_context_t unw_context_t; struct unw_cursor_t { uint64_t data[_LIBUNWIND_CURSOR_SIZE]; }; typedef struct unw_cursor_t unw_cursor_t; typedef struct unw_addr_space *unw_addr_space_t; typedef int unw_regnum_t; #if _LIBUNWIND_ARM_EHABI typedef uint32_t unw_word_t; typedef uint64_t unw_fpreg_t; #else typedef uint64_t unw_word_t; typedef double unw_fpreg_t; #endif struct unw_proc_info_t { unw_word_t start_ip; /* start address of function */ unw_word_t end_ip; /* address after end of function */ unw_word_t lsda; /* address of language specific data area, */ /* or zero if not used */ unw_word_t handler; /* personality routine, or zero if not used */ unw_word_t gp; /* not used */ unw_word_t flags; /* not used */ uint32_t format; /* compact unwind encoding, or zero if none */ uint32_t unwind_info_size; /* size of dwarf unwind info, or zero if none */ unw_word_t unwind_info; /* address of dwarf unwind info, or zero */ unw_word_t extra; /* mach_header of mach-o image containing func */ }; typedef struct unw_proc_info_t unw_proc_info_t; #ifdef __cplusplus extern "C" { #endif extern int unw_getcontext(unw_context_t *) LIBUNWIND_AVAIL; extern int unw_init_local(unw_cursor_t *, unw_context_t *) LIBUNWIND_AVAIL; extern int unw_step(unw_cursor_t *) LIBUNWIND_AVAIL; extern int unw_get_reg(unw_cursor_t *, unw_regnum_t, unw_word_t *) LIBUNWIND_AVAIL; extern int unw_get_fpreg(unw_cursor_t *, unw_regnum_t, unw_fpreg_t *) LIBUNWIND_AVAIL; extern int unw_set_reg(unw_cursor_t *, unw_regnum_t, unw_word_t) LIBUNWIND_AVAIL; extern int unw_set_fpreg(unw_cursor_t *, unw_regnum_t, unw_fpreg_t) LIBUNWIND_AVAIL; extern int unw_resume(unw_cursor_t *) LIBUNWIND_AVAIL; #ifdef __arm__ /* Save VFP registers in FSTMX format (instead of FSTMD). 
*/ extern void unw_save_vfp_as_X(unw_cursor_t *) LIBUNWIND_AVAIL; #endif extern const char *unw_regname(unw_cursor_t *, unw_regnum_t) LIBUNWIND_AVAIL; extern int unw_get_proc_info(unw_cursor_t *, unw_proc_info_t *) LIBUNWIND_AVAIL; extern int unw_is_fpreg(unw_cursor_t *, unw_regnum_t) LIBUNWIND_AVAIL; extern int unw_is_signal_frame(unw_cursor_t *) LIBUNWIND_AVAIL; extern int unw_get_proc_name(unw_cursor_t *, char *, size_t, unw_word_t *) LIBUNWIND_AVAIL; //extern int unw_get_save_loc(unw_cursor_t*, int, unw_save_loc_t*); extern unw_addr_space_t unw_local_addr_space; #ifdef UNW_REMOTE /* * Mac OS X "remote" API for unwinding other processes on same machine * */ extern unw_addr_space_t unw_create_addr_space_for_task(task_t); extern void unw_destroy_addr_space(unw_addr_space_t); extern int unw_init_remote_thread(unw_cursor_t *, unw_addr_space_t, thread_t *); #endif /* UNW_REMOTE */ /* - * traditional libuwind "remote" API + * traditional libunwind "remote" API * NOT IMPLEMENTED on Mac OS X * * extern int unw_init_remote(unw_cursor_t*, unw_addr_space_t, * thread_t*); * extern unw_accessors_t unw_get_accessors(unw_addr_space_t); * extern unw_addr_space_t unw_create_addr_space(unw_accessors_t, int); * extern void unw_flush_cache(unw_addr_space_t, unw_word_t, * unw_word_t); * extern int unw_set_caching_policy(unw_addr_space_t, * unw_caching_policy_t); * extern void _U_dyn_register(unw_dyn_info_t*); * extern void _U_dyn_cancel(unw_dyn_info_t*); */ #ifdef __cplusplus } #endif // architecture independent register numbers enum { UNW_REG_IP = -1, // instruction pointer UNW_REG_SP = -2, // stack pointer }; // 32-bit x86 registers enum { UNW_X86_EAX = 0, UNW_X86_ECX = 1, UNW_X86_EDX = 2, UNW_X86_EBX = 3, UNW_X86_ESP = 4, UNW_X86_EBP = 5, UNW_X86_ESI = 6, UNW_X86_EDI = 7 }; // 64-bit x86_64 registers enum { UNW_X86_64_RAX = 0, UNW_X86_64_RDX = 1, UNW_X86_64_RCX = 2, UNW_X86_64_RBX = 3, UNW_X86_64_RSI = 4, UNW_X86_64_RDI = 5, UNW_X86_64_RBP = 6, UNW_X86_64_RSP = 7, UNW_X86_64_R8 = 8, UNW_X86_64_R9 = 9, UNW_X86_64_R10 = 10, UNW_X86_64_R11 = 11, UNW_X86_64_R12 = 12, UNW_X86_64_R13 = 13, UNW_X86_64_R14 = 14, UNW_X86_64_R15 = 15 }; // 32-bit ppc register numbers enum { UNW_PPC_R0 = 0, UNW_PPC_R1 = 1, UNW_PPC_R2 = 2, UNW_PPC_R3 = 3, UNW_PPC_R4 = 4, UNW_PPC_R5 = 5, UNW_PPC_R6 = 6, UNW_PPC_R7 = 7, UNW_PPC_R8 = 8, UNW_PPC_R9 = 9, UNW_PPC_R10 = 10, UNW_PPC_R11 = 11, UNW_PPC_R12 = 12, UNW_PPC_R13 = 13, UNW_PPC_R14 = 14, UNW_PPC_R15 = 15, UNW_PPC_R16 = 16, UNW_PPC_R17 = 17, UNW_PPC_R18 = 18, UNW_PPC_R19 = 19, UNW_PPC_R20 = 20, UNW_PPC_R21 = 21, UNW_PPC_R22 = 22, UNW_PPC_R23 = 23, UNW_PPC_R24 = 24, UNW_PPC_R25 = 25, UNW_PPC_R26 = 26, UNW_PPC_R27 = 27, UNW_PPC_R28 = 28, UNW_PPC_R29 = 29, UNW_PPC_R30 = 30, UNW_PPC_R31 = 31, UNW_PPC_F0 = 32, UNW_PPC_F1 = 33, UNW_PPC_F2 = 34, UNW_PPC_F3 = 35, UNW_PPC_F4 = 36, UNW_PPC_F5 = 37, UNW_PPC_F6 = 38, UNW_PPC_F7 = 39, UNW_PPC_F8 = 40, UNW_PPC_F9 = 41, UNW_PPC_F10 = 42, UNW_PPC_F11 = 43, UNW_PPC_F12 = 44, UNW_PPC_F13 = 45, UNW_PPC_F14 = 46, UNW_PPC_F15 = 47, UNW_PPC_F16 = 48, UNW_PPC_F17 = 49, UNW_PPC_F18 = 50, UNW_PPC_F19 = 51, UNW_PPC_F20 = 52, UNW_PPC_F21 = 53, UNW_PPC_F22 = 54, UNW_PPC_F23 = 55, UNW_PPC_F24 = 56, UNW_PPC_F25 = 57, UNW_PPC_F26 = 58, UNW_PPC_F27 = 59, UNW_PPC_F28 = 60, UNW_PPC_F29 = 61, UNW_PPC_F30 = 62, UNW_PPC_F31 = 63, UNW_PPC_MQ = 64, UNW_PPC_LR = 65, UNW_PPC_CTR = 66, UNW_PPC_AP = 67, UNW_PPC_CR0 = 68, UNW_PPC_CR1 = 69, UNW_PPC_CR2 = 70, UNW_PPC_CR3 = 71, UNW_PPC_CR4 = 72, UNW_PPC_CR5 = 73, UNW_PPC_CR6 = 74, UNW_PPC_CR7 = 75, UNW_PPC_XER = 76, UNW_PPC_V0 = 
77, UNW_PPC_V1 = 78, UNW_PPC_V2 = 79, UNW_PPC_V3 = 80, UNW_PPC_V4 = 81, UNW_PPC_V5 = 82, UNW_PPC_V6 = 83, UNW_PPC_V7 = 84, UNW_PPC_V8 = 85, UNW_PPC_V9 = 86, UNW_PPC_V10 = 87, UNW_PPC_V11 = 88, UNW_PPC_V12 = 89, UNW_PPC_V13 = 90, UNW_PPC_V14 = 91, UNW_PPC_V15 = 92, UNW_PPC_V16 = 93, UNW_PPC_V17 = 94, UNW_PPC_V18 = 95, UNW_PPC_V19 = 96, UNW_PPC_V20 = 97, UNW_PPC_V21 = 98, UNW_PPC_V22 = 99, UNW_PPC_V23 = 100, UNW_PPC_V24 = 101, UNW_PPC_V25 = 102, UNW_PPC_V26 = 103, UNW_PPC_V27 = 104, UNW_PPC_V28 = 105, UNW_PPC_V29 = 106, UNW_PPC_V30 = 107, UNW_PPC_V31 = 108, UNW_PPC_VRSAVE = 109, UNW_PPC_VSCR = 110, UNW_PPC_SPE_ACC = 111, UNW_PPC_SPEFSCR = 112 }; // 64-bit ARM64 registers enum { UNW_ARM64_X0 = 0, UNW_ARM64_X1 = 1, UNW_ARM64_X2 = 2, UNW_ARM64_X3 = 3, UNW_ARM64_X4 = 4, UNW_ARM64_X5 = 5, UNW_ARM64_X6 = 6, UNW_ARM64_X7 = 7, UNW_ARM64_X8 = 8, UNW_ARM64_X9 = 9, UNW_ARM64_X10 = 10, UNW_ARM64_X11 = 11, UNW_ARM64_X12 = 12, UNW_ARM64_X13 = 13, UNW_ARM64_X14 = 14, UNW_ARM64_X15 = 15, UNW_ARM64_X16 = 16, UNW_ARM64_X17 = 17, UNW_ARM64_X18 = 18, UNW_ARM64_X19 = 19, UNW_ARM64_X20 = 20, UNW_ARM64_X21 = 21, UNW_ARM64_X22 = 22, UNW_ARM64_X23 = 23, UNW_ARM64_X24 = 24, UNW_ARM64_X25 = 25, UNW_ARM64_X26 = 26, UNW_ARM64_X27 = 27, UNW_ARM64_X28 = 28, UNW_ARM64_X29 = 29, UNW_ARM64_FP = 29, UNW_ARM64_X30 = 30, UNW_ARM64_LR = 30, UNW_ARM64_X31 = 31, UNW_ARM64_SP = 31, // reserved block UNW_ARM64_D0 = 64, UNW_ARM64_D1 = 65, UNW_ARM64_D2 = 66, UNW_ARM64_D3 = 67, UNW_ARM64_D4 = 68, UNW_ARM64_D5 = 69, UNW_ARM64_D6 = 70, UNW_ARM64_D7 = 71, UNW_ARM64_D8 = 72, UNW_ARM64_D9 = 73, UNW_ARM64_D10 = 74, UNW_ARM64_D11 = 75, UNW_ARM64_D12 = 76, UNW_ARM64_D13 = 77, UNW_ARM64_D14 = 78, UNW_ARM64_D15 = 79, UNW_ARM64_D16 = 80, UNW_ARM64_D17 = 81, UNW_ARM64_D18 = 82, UNW_ARM64_D19 = 83, UNW_ARM64_D20 = 84, UNW_ARM64_D21 = 85, UNW_ARM64_D22 = 86, UNW_ARM64_D23 = 87, UNW_ARM64_D24 = 88, UNW_ARM64_D25 = 89, UNW_ARM64_D26 = 90, UNW_ARM64_D27 = 91, UNW_ARM64_D28 = 92, UNW_ARM64_D29 = 93, UNW_ARM64_D30 = 94, UNW_ARM64_D31 = 95, }; // 32-bit ARM registers. Numbers match DWARF for ARM spec #3.1 Table 1. // Naming scheme uses recommendations given in Note 4 for VFP-v2 and VFP-v3. // In this scheme, even though the 64-bit floating point registers D0-D31 // overlap physically with the 32-bit floating pointer registers S0-S31, // they are given a non-overlapping range of register numbers. // // Commented out ranges are not preserved during unwinding. enum { UNW_ARM_R0 = 0, UNW_ARM_R1 = 1, UNW_ARM_R2 = 2, UNW_ARM_R3 = 3, UNW_ARM_R4 = 4, UNW_ARM_R5 = 5, UNW_ARM_R6 = 6, UNW_ARM_R7 = 7, UNW_ARM_R8 = 8, UNW_ARM_R9 = 9, UNW_ARM_R10 = 10, UNW_ARM_R11 = 11, UNW_ARM_R12 = 12, UNW_ARM_SP = 13, // Logical alias for UNW_REG_SP UNW_ARM_R13 = 13, UNW_ARM_LR = 14, UNW_ARM_R14 = 14, UNW_ARM_IP = 15, // Logical alias for UNW_REG_IP UNW_ARM_R15 = 15, // 16-63 -- OBSOLETE. Used in VFP1 to represent both S0-S31 and D0-D31. UNW_ARM_S0 = 64, UNW_ARM_S1 = 65, UNW_ARM_S2 = 66, UNW_ARM_S3 = 67, UNW_ARM_S4 = 68, UNW_ARM_S5 = 69, UNW_ARM_S6 = 70, UNW_ARM_S7 = 71, UNW_ARM_S8 = 72, UNW_ARM_S9 = 73, UNW_ARM_S10 = 74, UNW_ARM_S11 = 75, UNW_ARM_S12 = 76, UNW_ARM_S13 = 77, UNW_ARM_S14 = 78, UNW_ARM_S15 = 79, UNW_ARM_S16 = 80, UNW_ARM_S17 = 81, UNW_ARM_S18 = 82, UNW_ARM_S19 = 83, UNW_ARM_S20 = 84, UNW_ARM_S21 = 85, UNW_ARM_S22 = 86, UNW_ARM_S23 = 87, UNW_ARM_S24 = 88, UNW_ARM_S25 = 89, UNW_ARM_S26 = 90, UNW_ARM_S27 = 91, UNW_ARM_S28 = 92, UNW_ARM_S29 = 93, UNW_ARM_S30 = 94, UNW_ARM_S31 = 95, // 96-103 -- OBSOLETE. F0-F7. Used by the FPA system. Superseded by VFP. 
// 104-111 -- wCGR0-wCGR7, ACC0-ACC7 (Intel wireless MMX) UNW_ARM_WR0 = 112, UNW_ARM_WR1 = 113, UNW_ARM_WR2 = 114, UNW_ARM_WR3 = 115, UNW_ARM_WR4 = 116, UNW_ARM_WR5 = 117, UNW_ARM_WR6 = 118, UNW_ARM_WR7 = 119, UNW_ARM_WR8 = 120, UNW_ARM_WR9 = 121, UNW_ARM_WR10 = 122, UNW_ARM_WR11 = 123, UNW_ARM_WR12 = 124, UNW_ARM_WR13 = 125, UNW_ARM_WR14 = 126, UNW_ARM_WR15 = 127, // 128-133 -- SPSR, SPSR_{FIQ|IRQ|ABT|UND|SVC} // 134-143 -- Reserved // 144-150 -- R8_USR-R14_USR // 151-157 -- R8_FIQ-R14_FIQ // 158-159 -- R13_IRQ-R14_IRQ // 160-161 -- R13_ABT-R14_ABT // 162-163 -- R13_UND-R14_UND // 164-165 -- R13_SVC-R14_SVC // 166-191 -- Reserved UNW_ARM_WC0 = 192, UNW_ARM_WC1 = 193, UNW_ARM_WC2 = 194, UNW_ARM_WC3 = 195, // 196-199 -- wC4-wC7 (Intel wireless MMX control) // 200-255 -- Reserved UNW_ARM_D0 = 256, UNW_ARM_D1 = 257, UNW_ARM_D2 = 258, UNW_ARM_D3 = 259, UNW_ARM_D4 = 260, UNW_ARM_D5 = 261, UNW_ARM_D6 = 262, UNW_ARM_D7 = 263, UNW_ARM_D8 = 264, UNW_ARM_D9 = 265, UNW_ARM_D10 = 266, UNW_ARM_D11 = 267, UNW_ARM_D12 = 268, UNW_ARM_D13 = 269, UNW_ARM_D14 = 270, UNW_ARM_D15 = 271, UNW_ARM_D16 = 272, UNW_ARM_D17 = 273, UNW_ARM_D18 = 274, UNW_ARM_D19 = 275, UNW_ARM_D20 = 276, UNW_ARM_D21 = 277, UNW_ARM_D22 = 278, UNW_ARM_D23 = 279, UNW_ARM_D24 = 280, UNW_ARM_D25 = 281, UNW_ARM_D26 = 282, UNW_ARM_D27 = 283, UNW_ARM_D28 = 284, UNW_ARM_D29 = 285, UNW_ARM_D30 = 286, UNW_ARM_D31 = 287, // 288-319 -- Reserved for VFP/Neon // 320-8191 -- Reserved // 8192-16383 -- Unspecified vendor co-processor register. }; // OpenRISC1000 register numbers enum { UNW_OR1K_R0 = 0, UNW_OR1K_R1 = 1, UNW_OR1K_R2 = 2, UNW_OR1K_R3 = 3, UNW_OR1K_R4 = 4, UNW_OR1K_R5 = 5, UNW_OR1K_R6 = 6, UNW_OR1K_R7 = 7, UNW_OR1K_R8 = 8, UNW_OR1K_R9 = 9, UNW_OR1K_R10 = 10, UNW_OR1K_R11 = 11, UNW_OR1K_R12 = 12, UNW_OR1K_R13 = 13, UNW_OR1K_R14 = 14, UNW_OR1K_R15 = 15, UNW_OR1K_R16 = 16, UNW_OR1K_R17 = 17, UNW_OR1K_R18 = 18, UNW_OR1K_R19 = 19, UNW_OR1K_R20 = 20, UNW_OR1K_R21 = 21, UNW_OR1K_R22 = 22, UNW_OR1K_R23 = 23, UNW_OR1K_R24 = 24, UNW_OR1K_R25 = 25, UNW_OR1K_R26 = 26, UNW_OR1K_R27 = 27, UNW_OR1K_R28 = 28, UNW_OR1K_R29 = 29, UNW_OR1K_R30 = 30, UNW_OR1K_R31 = 31, }; // 64-bit RISC-V registers enum { UNW_RISCV_X0 = 0, UNW_RISCV_X1 = 1, UNW_RISCV_RA = 1, UNW_RISCV_X2 = 2, UNW_RISCV_SP = 2, UNW_RISCV_X3 = 3, UNW_RISCV_X4 = 4, UNW_RISCV_X5 = 5, UNW_RISCV_X6 = 6, UNW_RISCV_X7 = 7, UNW_RISCV_X8 = 8, UNW_RISCV_X9 = 9, UNW_RISCV_X10 = 10, UNW_RISCV_X11 = 11, UNW_RISCV_X12 = 12, UNW_RISCV_X13 = 13, UNW_RISCV_X14 = 14, UNW_RISCV_X15 = 15, UNW_RISCV_X16 = 16, UNW_RISCV_X17 = 17, UNW_RISCV_X18 = 18, UNW_RISCV_X19 = 19, UNW_RISCV_X20 = 20, UNW_RISCV_X21 = 21, UNW_RISCV_X22 = 22, UNW_RISCV_X23 = 23, UNW_RISCV_X24 = 24, UNW_RISCV_X25 = 25, UNW_RISCV_X26 = 26, UNW_RISCV_X27 = 27, UNW_RISCV_X28 = 28, UNW_RISCV_X29 = 29, UNW_RISCV_X30 = 30, UNW_RISCV_X31 = 31, // reserved block UNW_RISCV_D0 = 64, UNW_RISCV_D1 = 65, UNW_RISCV_D2 = 66, UNW_RISCV_D3 = 67, UNW_RISCV_D4 = 68, UNW_RISCV_D5 = 69, UNW_RISCV_D6 = 70, UNW_RISCV_D7 = 71, UNW_RISCV_D8 = 72, UNW_RISCV_D9 = 73, UNW_RISCV_D10 = 74, UNW_RISCV_D11 = 75, UNW_RISCV_D12 = 76, UNW_RISCV_D13 = 77, UNW_RISCV_D14 = 78, UNW_RISCV_D15 = 79, UNW_RISCV_D16 = 80, UNW_RISCV_D17 = 81, UNW_RISCV_D18 = 82, UNW_RISCV_D19 = 83, UNW_RISCV_D20 = 84, UNW_RISCV_D21 = 85, UNW_RISCV_D22 = 86, UNW_RISCV_D23 = 87, UNW_RISCV_D24 = 88, UNW_RISCV_D25 = 89, UNW_RISCV_D26 = 90, UNW_RISCV_D27 = 91, UNW_RISCV_D28 = 92, UNW_RISCV_D29 = 93, UNW_RISCV_D30 = 94, UNW_RISCV_D31 = 95, }; #endif Index: 
user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/AddressSpace.hpp =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/AddressSpace.hpp (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/AddressSpace.hpp (revision 308054) @@ -1,599 +1,599 @@ //===------------------------- AddressSpace.hpp ---------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Abstracts accessing local vs remote address spaces. // //===----------------------------------------------------------------------===// #ifndef __ADDRESSSPACE_HPP__ #define __ADDRESSSPACE_HPP__ #include #include #include #include #ifndef _LIBUNWIND_IS_BAREMETAL #include #endif #ifdef __APPLE__ #include namespace libunwind { bool checkKeyMgrRegisteredFDEs(uintptr_t targetAddr, void *&fde); } #endif #include "libunwind.h" #include "config.h" #include "dwarf2.h" #include "Registers.hpp" #if _LIBUNWIND_ARM_EHABI #if defined(__FreeBSD__) || defined(__NetBSD__) #include typedef void *_Unwind_Ptr; #elif defined(__linux__) typedef long unsigned int *_Unwind_Ptr; extern "C" _Unwind_Ptr __gnu_Unwind_Find_exidx(_Unwind_Ptr addr, int *len); // Emulate the BSD dl_unwind_find_exidx API when on a GNU libdl system. #define dl_unwind_find_exidx __gnu_Unwind_Find_exidx #elif !defined(_LIBUNWIND_IS_BAREMETAL) #include #else // !defined(_LIBUNWIND_IS_BAREMETAL) // When statically linked on bare-metal, the symbols for the EH table are looked // up without going through the dynamic loader. struct EHTEntry { uint32_t functionOffset; uint32_t unwindOpcodes; }; extern EHTEntry __exidx_start; extern EHTEntry __exidx_end; #endif // !defined(_LIBUNWIND_IS_BAREMETAL) #endif // _LIBUNWIND_ARM_EHABI #if defined(__CloudABI__) || defined(__FreeBSD__) || defined(__linux__) || \ defined(__NetBSD__) #if _LIBUNWIND_SUPPORT_DWARF_UNWIND && _LIBUNWIND_SUPPORT_DWARF_INDEX #include // Macro for machine-independent access to the ELF program headers. This // macro is not available on some systems (e.g., FreeBSD). On these // systems the data structures are just called Elf_XXX. Define ElfW() // locally. #if !defined(ElfW) #define ElfW(type) Elf_##type #endif #include "EHHeaderParser.hpp" #endif #endif namespace libunwind { /// Used by findUnwindSections() to return info about needed sections. struct UnwindInfoSections { #if _LIBUNWIND_SUPPORT_DWARF_UNWIND || _LIBUNWIND_SUPPORT_DWARF_INDEX || \ _LIBUNWIND_SUPPORT_COMPACT_UNWIND // No dso_base for ARM EHABI. uintptr_t dso_base; #endif #if _LIBUNWIND_SUPPORT_DWARF_UNWIND uintptr_t dwarf_section; uintptr_t dwarf_section_length; #endif #if _LIBUNWIND_SUPPORT_DWARF_INDEX uintptr_t dwarf_index_section; uintptr_t dwarf_index_section_length; #endif #if _LIBUNWIND_SUPPORT_COMPACT_UNWIND uintptr_t compact_unwind_section; uintptr_t compact_unwind_section_length; #endif #if _LIBUNWIND_ARM_EHABI uintptr_t arm_section; uintptr_t arm_section_length; #endif }; /// LocalAddressSpace is used as a template parameter to UnwindCursor when /// unwinding a thread in the same process. The wrappers compile away, /// making local unwinds fast. 
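A minimal sketch of how the public API declared in the libunwind.h hunk above is typically driven for a local backtrace; the dump_backtrace() helper and the <stdio.h> printing are hypothetical illustrations, not part of this revision:

#include <libunwind.h>
#include <stdio.h>

/* Hypothetical example: walk the calling thread's stack, one line per frame. */
static void dump_backtrace(void) {
  unw_context_t ctx;
  unw_cursor_t cursor;
  unw_getcontext(&ctx);            /* snapshot the current register state */
  unw_init_local(&cursor, &ctx);   /* bind a cursor to this thread        */
  do {
    unw_word_t ip = 0, offset = 0;
    char name[256];
    unw_get_reg(&cursor, UNW_REG_IP, &ip);
    if (unw_get_proc_name(&cursor, name, sizeof(name), &offset) != UNW_ESUCCESS)
      name[0] = '\0';
    printf("ip=%#llx %s+%#llx\n", (unsigned long long)ip, name,
           (unsigned long long)offset);
  } while (unw_step(&cursor) > 0); /* a positive return means another frame */
}

Every entry point used above is declared earlier in this diff; the LocalAddressSpace class that follows is the internal machinery behind unw_init_local() and never appears in the public interface.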
class __attribute__((visibility("hidden"))) LocalAddressSpace { public: #ifdef __LP64__ typedef uint64_t pint_t; typedef int64_t sint_t; #else typedef uint32_t pint_t; typedef int32_t sint_t; #endif uint8_t get8(pint_t addr) { uint8_t val; memcpy(&val, (void *)addr, sizeof(val)); return val; } uint16_t get16(pint_t addr) { uint16_t val; memcpy(&val, (void *)addr, sizeof(val)); return val; } uint32_t get32(pint_t addr) { uint32_t val; memcpy(&val, (void *)addr, sizeof(val)); return val; } uint64_t get64(pint_t addr) { uint64_t val; memcpy(&val, (void *)addr, sizeof(val)); return val; } double getDouble(pint_t addr) { double val; memcpy(&val, (void *)addr, sizeof(val)); return val; } v128 getVector(pint_t addr) { v128 val; memcpy(&val, (void *)addr, sizeof(val)); return val; } uintptr_t getP(pint_t addr); static uint64_t getULEB128(pint_t &addr, pint_t end); static int64_t getSLEB128(pint_t &addr, pint_t end); pint_t getEncodedP(pint_t &addr, pint_t end, uint8_t encoding, pint_t datarelBase = 0); bool findFunctionName(pint_t addr, char *buf, size_t bufLen, unw_word_t *offset); bool findUnwindSections(pint_t targetAddr, UnwindInfoSections &info); bool findOtherFDE(pint_t targetAddr, pint_t &fde); static LocalAddressSpace sThisAddressSpace; }; inline uintptr_t LocalAddressSpace::getP(pint_t addr) { #ifdef __LP64__ return get64(addr); #else return get32(addr); #endif } /// Read a ULEB128 into a 64-bit word. inline uint64_t LocalAddressSpace::getULEB128(pint_t &addr, pint_t end) { const uint8_t *p = (uint8_t *)addr; const uint8_t *pend = (uint8_t *)end; uint64_t result = 0; int bit = 0; do { uint64_t b; if (p == pend) _LIBUNWIND_ABORT("truncated uleb128 expression"); b = *p & 0x7f; if (bit >= 64 || b << bit >> bit != b) { _LIBUNWIND_ABORT("malformed uleb128 expression"); } else { result |= b << bit; bit += 7; } } while (*p++ >= 0x80); addr = (pint_t) p; return result; } /// Read a SLEB128 into a 64-bit word. inline int64_t LocalAddressSpace::getSLEB128(pint_t &addr, pint_t end) { const uint8_t *p = (uint8_t *)addr; const uint8_t *pend = (uint8_t *)end; int64_t result = 0; int bit = 0; uint8_t byte; do { if (p == pend) _LIBUNWIND_ABORT("truncated sleb128 expression"); byte = *p++; result |= ((byte & 0x7f) << bit); bit += 7; } while (byte & 0x80); // sign extend negative numbers if ((byte & 0x40) != 0) result |= (-1LL) << bit; addr = (pint_t) p; return result; } inline LocalAddressSpace::pint_t LocalAddressSpace::getEncodedP(pint_t &addr, pint_t end, uint8_t encoding, pint_t datarelBase) { pint_t startAddr = addr; const uint8_t *p = (uint8_t *)addr; pint_t result; // first get value switch (encoding & 0x0F) { case DW_EH_PE_ptr: result = getP(addr); p += sizeof(pint_t); addr = (pint_t) p; break; case DW_EH_PE_uleb128: result = (pint_t)getULEB128(addr, end); break; case DW_EH_PE_udata2: result = get16(addr); p += 2; addr = (pint_t) p; break; case DW_EH_PE_udata4: result = get32(addr); p += 4; addr = (pint_t) p; break; case DW_EH_PE_udata8: result = (pint_t)get64(addr); p += 8; addr = (pint_t) p; break; case DW_EH_PE_sleb128: result = (pint_t)getSLEB128(addr, end); break; case DW_EH_PE_sdata2: // Sign extend from signed 16-bit value. result = (pint_t)(int16_t)get16(addr); p += 2; addr = (pint_t) p; break; case DW_EH_PE_sdata4: // Sign extend from signed 32-bit value. 
result = (pint_t)(int32_t)get32(addr); p += 4; addr = (pint_t) p; break; case DW_EH_PE_sdata8: result = (pint_t)get64(addr); p += 8; addr = (pint_t) p; break; default: _LIBUNWIND_ABORT("unknown pointer encoding"); } // then add relative offset switch (encoding & 0x70) { case DW_EH_PE_absptr: // do nothing break; case DW_EH_PE_pcrel: result += startAddr; break; case DW_EH_PE_textrel: _LIBUNWIND_ABORT("DW_EH_PE_textrel pointer encoding not supported"); break; case DW_EH_PE_datarel: // DW_EH_PE_datarel is only valid in a few places, so the parameter has a // default value of 0, and we abort in the event that someone calls this // function with a datarelBase of 0 and DW_EH_PE_datarel encoding. if (datarelBase == 0) _LIBUNWIND_ABORT("DW_EH_PE_datarel is invalid with a datarelBase of 0"); result += datarelBase; break; case DW_EH_PE_funcrel: _LIBUNWIND_ABORT("DW_EH_PE_funcrel pointer encoding not supported"); break; case DW_EH_PE_aligned: _LIBUNWIND_ABORT("DW_EH_PE_aligned pointer encoding not supported"); break; default: _LIBUNWIND_ABORT("unknown pointer encoding"); break; } if (encoding & DW_EH_PE_indirect) result = getP(result); return result; } #ifdef __APPLE__ struct dyld_unwind_sections { const struct mach_header* mh; const void* dwarf_section; uintptr_t dwarf_section_length; const void* compact_unwind_section; uintptr_t compact_unwind_section_length; }; #if (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) \ && (__MAC_OS_X_VERSION_MIN_REQUIRED >= 1070)) \ || defined(__IPHONE_OS_VERSION_MIN_REQUIRED) // In 10.7.0 or later, libSystem.dylib implements this function. extern "C" bool _dyld_find_unwind_sections(void *, dyld_unwind_sections *); #else // In 10.6.x and earlier, we need to implement this functionality. static inline bool _dyld_find_unwind_sections(void* addr, dyld_unwind_sections* info) { // Find mach-o image containing address. Dl_info dlinfo; if (!dladdr(addr, &dlinfo)) return false; const mach_header *mh = (const mach_header *)dlinfo.dli_saddr; // Find dwarf unwind section in that image. unsigned long size; const uint8_t *p = getsectiondata(mh, "__TEXT", "__eh_frame", &size); if (!p) return false; // Fill in return struct. 
info->mh = mh; info->dwarf_section = p; info->dwarf_section_length = size; info->compact_unwind_section = 0; info->compact_unwind_section_length = 0; return true; } #endif #endif inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr, UnwindInfoSections &info) { #ifdef __APPLE__ dyld_unwind_sections dyldInfo; if (_dyld_find_unwind_sections((void *)targetAddr, &dyldInfo)) { info.dso_base = (uintptr_t)dyldInfo.mh; #if _LIBUNWIND_SUPPORT_DWARF_UNWIND info.dwarf_section = (uintptr_t)dyldInfo.dwarf_section; info.dwarf_section_length = dyldInfo.dwarf_section_length; #endif info.compact_unwind_section = (uintptr_t)dyldInfo.compact_unwind_section; info.compact_unwind_section_length = dyldInfo.compact_unwind_section_length; return true; } #elif _LIBUNWIND_ARM_EHABI #ifdef _LIBUNWIND_IS_BAREMETAL // Bare metal is statically linked, so no need to ask the dynamic loader info.arm_section = (uintptr_t)(&__exidx_start); info.arm_section_length = (uintptr_t)(&__exidx_end - &__exidx_start); #else int length = 0; info.arm_section = (uintptr_t) dl_unwind_find_exidx( (_Unwind_Ptr) targetAddr, &length); info.arm_section_length = (uintptr_t)length; #endif - _LIBUNWIND_TRACE_UNWINDING("findUnwindSections: section %X length %x\n", + _LIBUNWIND_TRACE_UNWINDING("findUnwindSections: section %X length %x", info.arm_section, info.arm_section_length); if (info.arm_section && info.arm_section_length) return true; #elif _LIBUNWIND_SUPPORT_DWARF_UNWIND #if _LIBUNWIND_SUPPORT_DWARF_INDEX struct dl_iterate_cb_data { LocalAddressSpace *addressSpace; UnwindInfoSections *sects; uintptr_t targetAddr; }; dl_iterate_cb_data cb_data = {this, &info, targetAddr}; int found = dl_iterate_phdr( [](struct dl_phdr_info *pinfo, size_t, void *data) -> int { auto cbdata = static_cast(data); size_t object_length; bool found_obj = false; bool found_hdr = false; assert(cbdata); assert(cbdata->sects); if (cbdata->targetAddr < pinfo->dlpi_addr) { return false; } #if !defined(Elf_Half) typedef ElfW(Half) Elf_Half; #endif #if !defined(Elf_Phdr) typedef ElfW(Phdr) Elf_Phdr; #endif for (Elf_Half i = 0; i < pinfo->dlpi_phnum; i++) { const Elf_Phdr *phdr = &pinfo->dlpi_phdr[i]; if (phdr->p_type == PT_LOAD) { uintptr_t begin = pinfo->dlpi_addr + phdr->p_vaddr; uintptr_t end = begin + phdr->p_memsz; if (cbdata->targetAddr >= begin && cbdata->targetAddr < end) { cbdata->sects->dso_base = begin; object_length = phdr->p_memsz; found_obj = true; } } else if (phdr->p_type == PT_GNU_EH_FRAME) { EHHeaderParser::EHHeaderInfo hdrInfo; uintptr_t eh_frame_hdr_start = pinfo->dlpi_addr + phdr->p_vaddr; cbdata->sects->dwarf_index_section = eh_frame_hdr_start; cbdata->sects->dwarf_index_section_length = phdr->p_memsz; EHHeaderParser::decodeEHHdr( *cbdata->addressSpace, eh_frame_hdr_start, phdr->p_memsz, hdrInfo); cbdata->sects->dwarf_section = hdrInfo.eh_frame_ptr; found_hdr = true; } } if (found_obj && found_hdr) { cbdata->sects->dwarf_section_length = object_length; return true; } else { return false; } }, &cb_data); return static_cast(found); #else #error "_LIBUNWIND_SUPPORT_DWARF_UNWIND requires _LIBUNWIND_SUPPORT_DWARF_INDEX on this platform." #endif #endif return false; } inline bool LocalAddressSpace::findOtherFDE(pint_t targetAddr, pint_t &fde) { #ifdef __APPLE__ return checkKeyMgrRegisteredFDEs(targetAddr, *((void**)&fde)); #else // TO DO: if OS has way to dynamically register FDEs, check that. 
(void)targetAddr; (void)fde; return false; #endif } inline bool LocalAddressSpace::findFunctionName(pint_t addr, char *buf, size_t bufLen, unw_word_t *offset) { #ifndef _LIBUNWIND_IS_BAREMETAL Dl_info dyldInfo; if (dladdr((void *)addr, &dyldInfo)) { if (dyldInfo.dli_sname != NULL) { snprintf(buf, bufLen, "%s", dyldInfo.dli_sname); *offset = (addr - (pint_t) dyldInfo.dli_saddr); return true; } } #endif return false; } #ifdef UNW_REMOTE /// OtherAddressSpace is used as a template parameter to UnwindCursor when /// unwinding a thread in the another process. The other process can be a /// different endianness and a different pointer size which is handled by /// the P template parameter. template class OtherAddressSpace { public: OtherAddressSpace(task_t task) : fTask(task) {} typedef typename P::uint_t pint_t; uint8_t get8(pint_t addr); uint16_t get16(pint_t addr); uint32_t get32(pint_t addr); uint64_t get64(pint_t addr); pint_t getP(pint_t addr); uint64_t getULEB128(pint_t &addr, pint_t end); int64_t getSLEB128(pint_t &addr, pint_t end); pint_t getEncodedP(pint_t &addr, pint_t end, uint8_t encoding, pint_t datarelBase = 0); bool findFunctionName(pint_t addr, char *buf, size_t bufLen, unw_word_t *offset); bool findUnwindSections(pint_t targetAddr, UnwindInfoSections &info); bool findOtherFDE(pint_t targetAddr, pint_t &fde); private: void *localCopy(pint_t addr); task_t fTask; }; template uint8_t OtherAddressSpace
<P>
::get8(pint_t addr) { return *((uint8_t *)localCopy(addr)); } template uint16_t OtherAddressSpace
<P>
::get16(pint_t addr) { return P::E::get16(*(uint16_t *)localCopy(addr)); } template uint32_t OtherAddressSpace
<P>
::get32(pint_t addr) { return P::E::get32(*(uint32_t *)localCopy(addr)); } template uint64_t OtherAddressSpace
<P>
::get64(pint_t addr) { return P::E::get64(*(uint64_t *)localCopy(addr)); } template typename P::uint_t OtherAddressSpace
<P>
::getP(pint_t addr) { return P::getP(*(uint64_t *)localCopy(addr)); } template uint64_t OtherAddressSpace
<P>
::getULEB128(pint_t &addr, pint_t end) { uintptr_t size = (end - addr); LocalAddressSpace::pint_t laddr = (LocalAddressSpace::pint_t) localCopy(addr); LocalAddressSpace::pint_t sladdr = laddr; uint64_t result = LocalAddressSpace::getULEB128(laddr, laddr + size); addr += (laddr - sladdr); return result; } template int64_t OtherAddressSpace
<P>
::getSLEB128(pint_t &addr, pint_t end) { uintptr_t size = (end - addr); LocalAddressSpace::pint_t laddr = (LocalAddressSpace::pint_t) localCopy(addr); LocalAddressSpace::pint_t sladdr = laddr; uint64_t result = LocalAddressSpace::getSLEB128(laddr, laddr + size); addr += (laddr - sladdr); return result; } template void *OtherAddressSpace
<P>
::localCopy(pint_t addr) { // FIX ME } template bool OtherAddressSpace
<P>
::findFunctionName(pint_t addr, char *buf, size_t bufLen, unw_word_t *offset) { // FIX ME } /// unw_addr_space is the base class that abstract unw_addr_space_t type in /// libunwind.h points to. struct unw_addr_space { cpu_type_t cpuType; task_t taskPort; }; /// unw_addr_space_i386 is the concrete instance that a unw_addr_space_t points /// to when examining /// a 32-bit intel process. struct unw_addr_space_i386 : public unw_addr_space { unw_addr_space_i386(task_t task) : oas(task) {} OtherAddressSpace > oas; }; /// unw_addr_space_x86_64 is the concrete instance that a unw_addr_space_t /// points to when examining /// a 64-bit intel process. struct unw_addr_space_x86_64 : public unw_addr_space { unw_addr_space_x86_64(task_t task) : oas(task) {} OtherAddressSpace > oas; }; /// unw_addr_space_ppc is the concrete instance that a unw_addr_space_t points /// to when examining /// a 32-bit PowerPC process. struct unw_addr_space_ppc : public unw_addr_space { unw_addr_space_ppc(task_t task) : oas(task) {} OtherAddressSpace > oas; }; #endif // UNW_REMOTE } // namespace libunwind #endif // __ADDRESSSPACE_HPP__ Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/CompactUnwinder.hpp =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/CompactUnwinder.hpp (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/CompactUnwinder.hpp (revision 308054) @@ -1,699 +1,699 @@ //===-------------------------- CompactUnwinder.hpp -----------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Does runtime stack unwinding using compact unwind encodings. 
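The frameless encodings handled in this header pack the identity of up to six saved registers into a small permutation field, which stepWithCompactEncodingFrameless() expands by successive divisions (a factorial-number-system decode). A standalone sketch of the same expansion for the three-register case follows; decode_permutation3() and its register numbering 1..6 mirror the unwinder's scheme, but the function itself is hypothetical and not part of this revision:

#include <cassert>
#include <cstdint>

// Recover three saved-register numbers from a packed permutation value the
// same way the frameless unwinder does: divide by 20, then by 4, then take
// the remainder, and map each digit into the shrinking set of unused
// register numbers 1..6.
static void decode_permutation3(uint32_t permutation, int out[3]) {
  uint32_t permunreg[3];
  permunreg[0] = permutation / 20; permutation -= permunreg[0] * 20;
  permunreg[1] = permutation / 4;  permutation -= permunreg[1] * 4;
  permunreg[2] = permutation;
  bool used[7] = {false};
  for (int i = 0; i < 3; ++i) {
    uint32_t renum = 0;
    for (int u = 1; u < 7; ++u) {
      if (used[u]) continue;
      if (renum == permunreg[i]) { out[i] = u; used[u] = true; break; }
      ++renum;
    }
  }
}

int main() {
  int regs[3];
  decode_permutation3(0, regs);    // smallest value: first three registers
  assert(regs[0] == 1 && regs[1] == 2 && regs[2] == 3);
  decode_permutation3(119, regs);  // largest value (6*5*4 = 120 orderings)
  assert(regs[0] == 6 && regs[1] == 5 && regs[2] == 4);
  return 0;
}

With six saved registers the divisors become 120, 24, 6 and 2, exactly as in the case-6 branch of the decoder below.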
// //===----------------------------------------------------------------------===// #ifndef __COMPACT_UNWINDER_HPP__ #define __COMPACT_UNWINDER_HPP__ #include #include #include #include #include "AddressSpace.hpp" #include "Registers.hpp" #define EXTRACT_BITS(value, mask) \ ((value >> __builtin_ctz(mask)) & (((1 << __builtin_popcount(mask))) - 1)) namespace libunwind { #if defined(_LIBUNWIND_TARGET_I386) /// CompactUnwinder_x86 uses a compact unwind info to virtually "step" (aka /// unwind) by modifying a Registers_x86 register set template class CompactUnwinder_x86 { public: static int stepWithCompactEncoding(compact_unwind_encoding_t info, uint32_t functionStart, A &addressSpace, Registers_x86 ®isters); private: typename A::pint_t pint_t; static void frameUnwind(A &addressSpace, Registers_x86 ®isters); static void framelessUnwind(A &addressSpace, typename A::pint_t returnAddressLocation, Registers_x86 ®isters); static int stepWithCompactEncodingEBPFrame(compact_unwind_encoding_t compactEncoding, uint32_t functionStart, A &addressSpace, Registers_x86 ®isters); static int stepWithCompactEncodingFrameless( compact_unwind_encoding_t compactEncoding, uint32_t functionStart, A &addressSpace, Registers_x86 ®isters, bool indirectStackSize); }; template int CompactUnwinder_x86::stepWithCompactEncoding( compact_unwind_encoding_t compactEncoding, uint32_t functionStart, A &addressSpace, Registers_x86 ®isters) { switch (compactEncoding & UNWIND_X86_MODE_MASK) { case UNWIND_X86_MODE_EBP_FRAME: return stepWithCompactEncodingEBPFrame(compactEncoding, functionStart, addressSpace, registers); case UNWIND_X86_MODE_STACK_IMMD: return stepWithCompactEncodingFrameless(compactEncoding, functionStart, addressSpace, registers, false); case UNWIND_X86_MODE_STACK_IND: return stepWithCompactEncodingFrameless(compactEncoding, functionStart, addressSpace, registers, true); } _LIBUNWIND_ABORT("invalid compact unwind encoding"); } template int CompactUnwinder_x86::stepWithCompactEncodingEBPFrame( compact_unwind_encoding_t compactEncoding, uint32_t functionStart, A &addressSpace, Registers_x86 ®isters) { uint32_t savedRegistersOffset = EXTRACT_BITS(compactEncoding, UNWIND_X86_EBP_FRAME_OFFSET); uint32_t savedRegistersLocations = EXTRACT_BITS(compactEncoding, UNWIND_X86_EBP_FRAME_REGISTERS); uint32_t savedRegisters = registers.getEBP() - 4 * savedRegistersOffset; for (int i = 0; i < 5; ++i) { switch (savedRegistersLocations & 0x7) { case UNWIND_X86_REG_NONE: // no register saved in this slot break; case UNWIND_X86_REG_EBX: registers.setEBX(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_ECX: registers.setECX(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_EDX: registers.setEDX(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_EDI: registers.setEDI(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_ESI: registers.setESI(addressSpace.get32(savedRegisters)); break; default: (void)functionStart; _LIBUNWIND_DEBUG_LOG("bad register for EBP frame, encoding=%08X for " - "function starting at 0x%X\n", + "function starting at 0x%X", compactEncoding, functionStart); _LIBUNWIND_ABORT("invalid compact unwind encoding"); } savedRegisters += 4; savedRegistersLocations = (savedRegistersLocations >> 3); } frameUnwind(addressSpace, registers); return UNW_STEP_SUCCESS; } template int CompactUnwinder_x86::stepWithCompactEncodingFrameless( compact_unwind_encoding_t encoding, uint32_t functionStart, A &addressSpace, Registers_x86 ®isters, bool indirectStackSize) { uint32_t 
stackSizeEncoded = EXTRACT_BITS(encoding, UNWIND_X86_FRAMELESS_STACK_SIZE); uint32_t stackAdjust = EXTRACT_BITS(encoding, UNWIND_X86_FRAMELESS_STACK_ADJUST); uint32_t regCount = EXTRACT_BITS(encoding, UNWIND_X86_FRAMELESS_STACK_REG_COUNT); uint32_t permutation = EXTRACT_BITS(encoding, UNWIND_X86_FRAMELESS_STACK_REG_PERMUTATION); uint32_t stackSize = stackSizeEncoded * 4; if (indirectStackSize) { // stack size is encoded in subl $xxx,%esp instruction uint32_t subl = addressSpace.get32(functionStart + stackSizeEncoded); stackSize = subl + 4 * stackAdjust; } // decompress permutation uint32_t permunreg[6]; switch (regCount) { case 6: permunreg[0] = permutation / 120; permutation -= (permunreg[0] * 120); permunreg[1] = permutation / 24; permutation -= (permunreg[1] * 24); permunreg[2] = permutation / 6; permutation -= (permunreg[2] * 6); permunreg[3] = permutation / 2; permutation -= (permunreg[3] * 2); permunreg[4] = permutation; permunreg[5] = 0; break; case 5: permunreg[0] = permutation / 120; permutation -= (permunreg[0] * 120); permunreg[1] = permutation / 24; permutation -= (permunreg[1] * 24); permunreg[2] = permutation / 6; permutation -= (permunreg[2] * 6); permunreg[3] = permutation / 2; permutation -= (permunreg[3] * 2); permunreg[4] = permutation; break; case 4: permunreg[0] = permutation / 60; permutation -= (permunreg[0] * 60); permunreg[1] = permutation / 12; permutation -= (permunreg[1] * 12); permunreg[2] = permutation / 3; permutation -= (permunreg[2] * 3); permunreg[3] = permutation; break; case 3: permunreg[0] = permutation / 20; permutation -= (permunreg[0] * 20); permunreg[1] = permutation / 4; permutation -= (permunreg[1] * 4); permunreg[2] = permutation; break; case 2: permunreg[0] = permutation / 5; permutation -= (permunreg[0] * 5); permunreg[1] = permutation; break; case 1: permunreg[0] = permutation; break; } // re-number registers back to standard numbers int registersSaved[6]; bool used[7] = { false, false, false, false, false, false, false }; for (uint32_t i = 0; i < regCount; ++i) { uint32_t renum = 0; for (int u = 1; u < 7; ++u) { if (!used[u]) { if (renum == permunreg[i]) { registersSaved[i] = u; used[u] = true; break; } ++renum; } } } uint32_t savedRegisters = registers.getSP() + stackSize - 4 - 4 * regCount; for (uint32_t i = 0; i < regCount; ++i) { switch (registersSaved[i]) { case UNWIND_X86_REG_EBX: registers.setEBX(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_ECX: registers.setECX(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_EDX: registers.setEDX(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_EDI: registers.setEDI(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_ESI: registers.setESI(addressSpace.get32(savedRegisters)); break; case UNWIND_X86_REG_EBP: registers.setEBP(addressSpace.get32(savedRegisters)); break; default: _LIBUNWIND_DEBUG_LOG("bad register for frameless, encoding=%08X for " - "function starting at 0x%X\n", + "function starting at 0x%X", encoding, functionStart); _LIBUNWIND_ABORT("invalid compact unwind encoding"); } savedRegisters += 4; } framelessUnwind(addressSpace, savedRegisters, registers); return UNW_STEP_SUCCESS; } template void CompactUnwinder_x86::frameUnwind(A &addressSpace, Registers_x86 ®isters) { typename A::pint_t bp = registers.getEBP(); // ebp points to old ebp registers.setEBP(addressSpace.get32(bp)); // old esp is ebp less saved ebp and return address registers.setSP((uint32_t)bp + 8); // pop return address into eip registers.setIP(addressSpace.get32(bp 
+ 4)); } template void CompactUnwinder_x86::framelessUnwind( A &addressSpace, typename A::pint_t returnAddressLocation, Registers_x86 ®isters) { // return address is on stack after last saved register registers.setIP(addressSpace.get32(returnAddressLocation)); // old esp is before return address registers.setSP((uint32_t)returnAddressLocation + 4); } #endif // _LIBUNWIND_TARGET_I386 #if defined(_LIBUNWIND_TARGET_X86_64) /// CompactUnwinder_x86_64 uses a compact unwind info to virtually "step" (aka /// unwind) by modifying a Registers_x86_64 register set template class CompactUnwinder_x86_64 { public: static int stepWithCompactEncoding(compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_x86_64 ®isters); private: typename A::pint_t pint_t; static void frameUnwind(A &addressSpace, Registers_x86_64 ®isters); static void framelessUnwind(A &addressSpace, uint64_t returnAddressLocation, Registers_x86_64 ®isters); static int stepWithCompactEncodingRBPFrame(compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_x86_64 ®isters); static int stepWithCompactEncodingFrameless( compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_x86_64 ®isters, bool indirectStackSize); }; template int CompactUnwinder_x86_64::stepWithCompactEncoding( compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_x86_64 ®isters) { switch (compactEncoding & UNWIND_X86_64_MODE_MASK) { case UNWIND_X86_64_MODE_RBP_FRAME: return stepWithCompactEncodingRBPFrame(compactEncoding, functionStart, addressSpace, registers); case UNWIND_X86_64_MODE_STACK_IMMD: return stepWithCompactEncodingFrameless(compactEncoding, functionStart, addressSpace, registers, false); case UNWIND_X86_64_MODE_STACK_IND: return stepWithCompactEncodingFrameless(compactEncoding, functionStart, addressSpace, registers, true); } _LIBUNWIND_ABORT("invalid compact unwind encoding"); } template int CompactUnwinder_x86_64::stepWithCompactEncodingRBPFrame( compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_x86_64 ®isters) { uint32_t savedRegistersOffset = EXTRACT_BITS(compactEncoding, UNWIND_X86_64_RBP_FRAME_OFFSET); uint32_t savedRegistersLocations = EXTRACT_BITS(compactEncoding, UNWIND_X86_64_RBP_FRAME_REGISTERS); uint64_t savedRegisters = registers.getRBP() - 8 * savedRegistersOffset; for (int i = 0; i < 5; ++i) { switch (savedRegistersLocations & 0x7) { case UNWIND_X86_64_REG_NONE: // no register saved in this slot break; case UNWIND_X86_64_REG_RBX: registers.setRBX(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R12: registers.setR12(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R13: registers.setR13(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R14: registers.setR14(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R15: registers.setR15(addressSpace.get64(savedRegisters)); break; default: (void)functionStart; _LIBUNWIND_DEBUG_LOG("bad register for RBP frame, encoding=%08X for " - "function starting at 0x%llX\n", + "function starting at 0x%llX", compactEncoding, functionStart); _LIBUNWIND_ABORT("invalid compact unwind encoding"); } savedRegisters += 8; savedRegistersLocations = (savedRegistersLocations >> 3); } frameUnwind(addressSpace, registers); return UNW_STEP_SUCCESS; } template int CompactUnwinder_x86_64::stepWithCompactEncodingFrameless( compact_unwind_encoding_t encoding, uint64_t 
functionStart, A &addressSpace, Registers_x86_64 ®isters, bool indirectStackSize) { uint32_t stackSizeEncoded = EXTRACT_BITS(encoding, UNWIND_X86_64_FRAMELESS_STACK_SIZE); uint32_t stackAdjust = EXTRACT_BITS(encoding, UNWIND_X86_64_FRAMELESS_STACK_ADJUST); uint32_t regCount = EXTRACT_BITS(encoding, UNWIND_X86_64_FRAMELESS_STACK_REG_COUNT); uint32_t permutation = EXTRACT_BITS(encoding, UNWIND_X86_64_FRAMELESS_STACK_REG_PERMUTATION); uint32_t stackSize = stackSizeEncoded * 8; if (indirectStackSize) { // stack size is encoded in subl $xxx,%esp instruction uint32_t subl = addressSpace.get32(functionStart + stackSizeEncoded); stackSize = subl + 8 * stackAdjust; } // decompress permutation uint32_t permunreg[6]; switch (regCount) { case 6: permunreg[0] = permutation / 120; permutation -= (permunreg[0] * 120); permunreg[1] = permutation / 24; permutation -= (permunreg[1] * 24); permunreg[2] = permutation / 6; permutation -= (permunreg[2] * 6); permunreg[3] = permutation / 2; permutation -= (permunreg[3] * 2); permunreg[4] = permutation; permunreg[5] = 0; break; case 5: permunreg[0] = permutation / 120; permutation -= (permunreg[0] * 120); permunreg[1] = permutation / 24; permutation -= (permunreg[1] * 24); permunreg[2] = permutation / 6; permutation -= (permunreg[2] * 6); permunreg[3] = permutation / 2; permutation -= (permunreg[3] * 2); permunreg[4] = permutation; break; case 4: permunreg[0] = permutation / 60; permutation -= (permunreg[0] * 60); permunreg[1] = permutation / 12; permutation -= (permunreg[1] * 12); permunreg[2] = permutation / 3; permutation -= (permunreg[2] * 3); permunreg[3] = permutation; break; case 3: permunreg[0] = permutation / 20; permutation -= (permunreg[0] * 20); permunreg[1] = permutation / 4; permutation -= (permunreg[1] * 4); permunreg[2] = permutation; break; case 2: permunreg[0] = permutation / 5; permutation -= (permunreg[0] * 5); permunreg[1] = permutation; break; case 1: permunreg[0] = permutation; break; } // re-number registers back to standard numbers int registersSaved[6]; bool used[7] = { false, false, false, false, false, false, false }; for (uint32_t i = 0; i < regCount; ++i) { uint32_t renum = 0; for (int u = 1; u < 7; ++u) { if (!used[u]) { if (renum == permunreg[i]) { registersSaved[i] = u; used[u] = true; break; } ++renum; } } } uint64_t savedRegisters = registers.getSP() + stackSize - 8 - 8 * regCount; for (uint32_t i = 0; i < regCount; ++i) { switch (registersSaved[i]) { case UNWIND_X86_64_REG_RBX: registers.setRBX(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R12: registers.setR12(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R13: registers.setR13(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R14: registers.setR14(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_R15: registers.setR15(addressSpace.get64(savedRegisters)); break; case UNWIND_X86_64_REG_RBP: registers.setRBP(addressSpace.get64(savedRegisters)); break; default: _LIBUNWIND_DEBUG_LOG("bad register for frameless, encoding=%08X for " - "function starting at 0x%llX\n", + "function starting at 0x%llX", encoding, functionStart); _LIBUNWIND_ABORT("invalid compact unwind encoding"); } savedRegisters += 8; } framelessUnwind(addressSpace, savedRegisters, registers); return UNW_STEP_SUCCESS; } template void CompactUnwinder_x86_64::frameUnwind(A &addressSpace, Registers_x86_64 ®isters) { uint64_t rbp = registers.getRBP(); // ebp points to old ebp registers.setRBP(addressSpace.get64(rbp)); // old esp is ebp less saved 
ebp and return address registers.setSP(rbp + 16); // pop return address into eip registers.setIP(addressSpace.get64(rbp + 8)); } template void CompactUnwinder_x86_64::framelessUnwind(A &addressSpace, uint64_t returnAddressLocation, Registers_x86_64 ®isters) { // return address is on stack after last saved register registers.setIP(addressSpace.get64(returnAddressLocation)); // old esp is before return address registers.setSP(returnAddressLocation + 8); } #endif // _LIBUNWIND_TARGET_X86_64 #if defined(_LIBUNWIND_TARGET_AARCH64) /// CompactUnwinder_arm64 uses a compact unwind info to virtually "step" (aka /// unwind) by modifying a Registers_arm64 register set template class CompactUnwinder_arm64 { public: static int stepWithCompactEncoding(compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_arm64 ®isters); private: typename A::pint_t pint_t; static int stepWithCompactEncodingFrame(compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_arm64 ®isters); static int stepWithCompactEncodingFrameless( compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_arm64 ®isters); }; template int CompactUnwinder_arm64::stepWithCompactEncoding( compact_unwind_encoding_t compactEncoding, uint64_t functionStart, A &addressSpace, Registers_arm64 ®isters) { switch (compactEncoding & UNWIND_ARM64_MODE_MASK) { case UNWIND_ARM64_MODE_FRAME: return stepWithCompactEncodingFrame(compactEncoding, functionStart, addressSpace, registers); case UNWIND_ARM64_MODE_FRAMELESS: return stepWithCompactEncodingFrameless(compactEncoding, functionStart, addressSpace, registers); } _LIBUNWIND_ABORT("invalid compact unwind encoding"); } template int CompactUnwinder_arm64::stepWithCompactEncodingFrameless( compact_unwind_encoding_t encoding, uint64_t, A &addressSpace, Registers_arm64 ®isters) { uint32_t stackSize = 16 * EXTRACT_BITS(encoding, UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK); uint64_t savedRegisterLoc = registers.getSP() + stackSize; if (encoding & UNWIND_ARM64_FRAME_X19_X20_PAIR) { registers.setRegister(UNW_ARM64_X19, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X20, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X21_X22_PAIR) { registers.setRegister(UNW_ARM64_X21, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X22, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X23_X24_PAIR) { registers.setRegister(UNW_ARM64_X23, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X24, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X25_X26_PAIR) { registers.setRegister(UNW_ARM64_X25, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X26, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X27_X28_PAIR) { registers.setRegister(UNW_ARM64_X27, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X28, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_D8_D9_PAIR) { registers.setFloatRegister(UNW_ARM64_D8, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D9, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc 
-= 8; } if (encoding & UNWIND_ARM64_FRAME_D10_D11_PAIR) { registers.setFloatRegister(UNW_ARM64_D10, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D11, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_D12_D13_PAIR) { registers.setFloatRegister(UNW_ARM64_D12, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D13, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_D14_D15_PAIR) { registers.setFloatRegister(UNW_ARM64_D14, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D15, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; } // subtract stack size off of sp registers.setSP(savedRegisterLoc); // set pc to be value in lr registers.setIP(registers.getRegister(UNW_ARM64_LR)); return UNW_STEP_SUCCESS; } template int CompactUnwinder_arm64::stepWithCompactEncodingFrame( compact_unwind_encoding_t encoding, uint64_t, A &addressSpace, Registers_arm64 ®isters) { uint64_t savedRegisterLoc = registers.getFP() - 8; if (encoding & UNWIND_ARM64_FRAME_X19_X20_PAIR) { registers.setRegister(UNW_ARM64_X19, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X20, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X21_X22_PAIR) { registers.setRegister(UNW_ARM64_X21, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X22, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X23_X24_PAIR) { registers.setRegister(UNW_ARM64_X23, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X24, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X25_X26_PAIR) { registers.setRegister(UNW_ARM64_X25, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X26, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_X27_X28_PAIR) { registers.setRegister(UNW_ARM64_X27, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setRegister(UNW_ARM64_X28, addressSpace.get64(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_D8_D9_PAIR) { registers.setFloatRegister(UNW_ARM64_D8, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D9, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_D10_D11_PAIR) { registers.setFloatRegister(UNW_ARM64_D10, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D11, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_D12_D13_PAIR) { registers.setFloatRegister(UNW_ARM64_D12, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D13, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; } if (encoding & UNWIND_ARM64_FRAME_D14_D15_PAIR) { registers.setFloatRegister(UNW_ARM64_D14, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; registers.setFloatRegister(UNW_ARM64_D15, addressSpace.getDouble(savedRegisterLoc)); savedRegisterLoc -= 8; } uint64_t fp = registers.getFP(); // fp 
points to old fp registers.setFP(addressSpace.get64(fp)); // old sp is fp less saved fp and lr registers.setSP(fp + 16); // pop return address into pc registers.setIP(addressSpace.get64(fp + 8)); return UNW_STEP_SUCCESS; } #endif // _LIBUNWIND_TARGET_AARCH64 } // namespace libunwind #endif // __COMPACT_UNWINDER_HPP__ Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/EHHeaderParser.hpp =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/EHHeaderParser.hpp (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/EHHeaderParser.hpp (revision 308054) @@ -1,161 +1,161 @@ //===------------------------- EHHeaderParser.hpp -------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Parses ELF .eh_frame_hdr sections. // //===----------------------------------------------------------------------===// #ifndef __EHHEADERPARSER_HPP__ #define __EHHEADERPARSER_HPP__ #include "libunwind.h" #include "AddressSpace.hpp" #include "DwarfParser.hpp" namespace libunwind { /// \brief EHHeaderParser does basic parsing of an ELF .eh_frame_hdr section. /// /// See DWARF spec for details: /// http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html /// template class EHHeaderParser { public: typedef typename A::pint_t pint_t; /// Information encoded in the EH frame header. struct EHHeaderInfo { pint_t eh_frame_ptr; size_t fde_count; pint_t table; uint8_t table_enc; }; static void decodeEHHdr(A &addressSpace, pint_t ehHdrStart, pint_t ehHdrEnd, EHHeaderInfo &ehHdrInfo); static bool findFDE(A &addressSpace, pint_t pc, pint_t ehHdrStart, uint32_t sectionLength, typename CFI_Parser::FDE_Info *fdeInfo, typename CFI_Parser::CIE_Info *cieInfo); private: static bool decodeTableEntry(A &addressSpace, pint_t &tableEntry, pint_t ehHdrStart, pint_t ehHdrEnd, uint8_t tableEnc, typename CFI_Parser::FDE_Info *fdeInfo, typename CFI_Parser::CIE_Info *cieInfo); static size_t getTableEntrySize(uint8_t tableEnc); }; template void EHHeaderParser::decodeEHHdr(A &addressSpace, pint_t ehHdrStart, pint_t ehHdrEnd, EHHeaderInfo &ehHdrInfo) { pint_t p = ehHdrStart; uint8_t version = addressSpace.get8(p++); if (version != 1) _LIBUNWIND_ABORT("Unsupported .eh_frame_hdr version"); uint8_t eh_frame_ptr_enc = addressSpace.get8(p++); uint8_t fde_count_enc = addressSpace.get8(p++); ehHdrInfo.table_enc = addressSpace.get8(p++); ehHdrInfo.eh_frame_ptr = addressSpace.getEncodedP(p, ehHdrEnd, eh_frame_ptr_enc, ehHdrStart); ehHdrInfo.fde_count = addressSpace.getEncodedP(p, ehHdrEnd, fde_count_enc, ehHdrStart); ehHdrInfo.table = p; } template bool EHHeaderParser::decodeTableEntry( A &addressSpace, pint_t &tableEntry, pint_t ehHdrStart, pint_t ehHdrEnd, uint8_t tableEnc, typename CFI_Parser::FDE_Info *fdeInfo, typename CFI_Parser::CIE_Info *cieInfo) { // Have to decode the whole FDE for the PC range anyway, so just throw away // the PC start. 
addressSpace.getEncodedP(tableEntry, ehHdrEnd, tableEnc, ehHdrStart); pint_t fde = addressSpace.getEncodedP(tableEntry, ehHdrEnd, tableEnc, ehHdrStart); const char *message = CFI_Parser::decodeFDE(addressSpace, fde, fdeInfo, cieInfo); if (message != NULL) { - _LIBUNWIND_DEBUG_LOG("EHHeaderParser::decodeTableEntry: bad fde: %s\n", + _LIBUNWIND_DEBUG_LOG("EHHeaderParser::decodeTableEntry: bad fde: %s", message); return false; } return true; } template bool EHHeaderParser::findFDE(A &addressSpace, pint_t pc, pint_t ehHdrStart, uint32_t sectionLength, typename CFI_Parser::FDE_Info *fdeInfo, typename CFI_Parser::CIE_Info *cieInfo) { pint_t ehHdrEnd = ehHdrStart + sectionLength; EHHeaderParser::EHHeaderInfo hdrInfo; EHHeaderParser::decodeEHHdr(addressSpace, ehHdrStart, ehHdrEnd, hdrInfo); size_t tableEntrySize = getTableEntrySize(hdrInfo.table_enc); pint_t tableEntry; size_t low = 0; for (size_t len = hdrInfo.fde_count; len > 1;) { size_t mid = low + (len / 2); tableEntry = hdrInfo.table + mid * tableEntrySize; pint_t start = addressSpace.getEncodedP(tableEntry, ehHdrEnd, hdrInfo.table_enc, ehHdrStart); if (start == pc) { low = mid; break; } else if (start < pc) { low = mid; len -= (len / 2); } else { len /= 2; } } tableEntry = hdrInfo.table + low * tableEntrySize; if (decodeTableEntry(addressSpace, tableEntry, ehHdrStart, ehHdrEnd, hdrInfo.table_enc, fdeInfo, cieInfo)) { if (pc >= fdeInfo->pcStart && pc < fdeInfo->pcEnd) return true; } return false; } template size_t EHHeaderParser::getTableEntrySize(uint8_t tableEnc) { switch (tableEnc & 0x0f) { case DW_EH_PE_sdata2: case DW_EH_PE_udata2: return 4; case DW_EH_PE_sdata4: case DW_EH_PE_udata4: return 8; case DW_EH_PE_sdata8: case DW_EH_PE_udata8: return 16; case DW_EH_PE_sleb128: case DW_EH_PE_uleb128: _LIBUNWIND_ABORT("Can't binary search on variable length encoded data."); case DW_EH_PE_omit: return 0; default: _LIBUNWIND_ABORT("Unknown DWARF encoding for search table."); } } } #endif Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/Unwind-EHABI.cpp =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/Unwind-EHABI.cpp (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/Unwind-EHABI.cpp (revision 308054) @@ -1,977 +1,977 @@ //===--------------------------- Unwind-EHABI.cpp -------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Implements ARM zero-cost C++ exceptions // //===----------------------------------------------------------------------===// #include "Unwind-EHABI.h" #if _LIBUNWIND_ARM_EHABI #include #include #include #include #include #include #include "config.h" #include "libunwind.h" #include "libunwind_ext.h" #include "unwind.h" namespace { // Strange order: take words in order, but inside word, take from most to least // signinficant byte. 
uint8_t getByte(const uint32_t* data, size_t offset) { const uint8_t* byteData = reinterpret_cast(data); return byteData[(offset & ~(size_t)0x03) + (3 - (offset & (size_t)0x03))]; } const char* getNextWord(const char* data, uint32_t* out) { *out = *reinterpret_cast(data); return data + 4; } const char* getNextNibble(const char* data, uint32_t* out) { *out = *reinterpret_cast(data); return data + 2; } struct Descriptor { // See # 9.2 typedef enum { SU16 = 0, // Short descriptor, 16-bit entries LU16 = 1, // Long descriptor, 16-bit entries LU32 = 3, // Long descriptor, 32-bit entries RESERVED0 = 4, RESERVED1 = 5, RESERVED2 = 6, RESERVED3 = 7, RESERVED4 = 8, RESERVED5 = 9, RESERVED6 = 10, RESERVED7 = 11, RESERVED8 = 12, RESERVED9 = 13, RESERVED10 = 14, RESERVED11 = 15 } Format; // See # 9.2 typedef enum { CLEANUP = 0x0, FUNC = 0x1, CATCH = 0x2, INVALID = 0x4 } Kind; }; _Unwind_Reason_Code ProcessDescriptors( _Unwind_State state, _Unwind_Control_Block* ucbp, struct _Unwind_Context* context, Descriptor::Format format, const char* descriptorStart, uint32_t flags) { // EHT is inlined in the index using compact form. No descriptors. #5 if (flags & 0x1) return _URC_CONTINUE_UNWIND; // TODO: We should check the state here, and determine whether we need to // perform phase1 or phase2 unwinding. (void)state; const char* descriptor = descriptorStart; uint32_t descriptorWord; getNextWord(descriptor, &descriptorWord); while (descriptorWord) { // Read descriptor based on # 9.2. uint32_t length; uint32_t offset; switch (format) { case Descriptor::LU32: descriptor = getNextWord(descriptor, &length); descriptor = getNextWord(descriptor, &offset); case Descriptor::LU16: descriptor = getNextNibble(descriptor, &length); descriptor = getNextNibble(descriptor, &offset); default: assert(false); return _URC_FAILURE; } // See # 9.2 table for decoding the kind of descriptor. It's a 2-bit value. Descriptor::Kind kind = static_cast((length & 0x1) | ((offset & 0x1) << 1)); // Clear off flag from last bit. length &= ~1u; offset &= ~1u; uintptr_t scopeStart = ucbp->pr_cache.fnstart + offset; uintptr_t scopeEnd = scopeStart + length; uintptr_t pc = _Unwind_GetIP(context); bool isInScope = (scopeStart <= pc) && (pc < scopeEnd); switch (kind) { case Descriptor::CLEANUP: { // TODO(ajwong): Handle cleanup descriptors. break; } case Descriptor::FUNC: { // TODO(ajwong): Handle function descriptors. break; } case Descriptor::CATCH: { // Catch descriptors require gobbling one more word. uint32_t landing_pad; descriptor = getNextWord(descriptor, &landing_pad); if (isInScope) { // TODO(ajwong): This is only phase1 compatible logic. Implement // phase2. 
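The landing-pad word handled just below is a prel31 field (a 31-bit signed, self-relative offset). As a hedged standalone sketch, the sign extension that signExtendPrel31 performs amounts to copying bit 30 into bit 31; the address-forming helper is hypothetical and only illustrates the usual AEHABI convention.

#include <cstdint>

// Copy bit 30 of a prel31 field into bit 31 to obtain a 32-bit signed offset.
static uint32_t prel31SignExtend(uint32_t value) {
  return value | ((value & 0x40000000u) << 1);
}

// Hypothetical helper: a prel31 field stored at address `place` usually
// encodes the target address `place + offset` (modulo 2^32).
static uint32_t prel31ToAddr(uint32_t place, uint32_t value) {
  return place + prel31SignExtend(value & 0x7fffffffu);
}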
landing_pad = signExtendPrel31(landing_pad & ~0x80000000); if (landing_pad == 0xffffffff) { return _URC_HANDLER_FOUND; } else if (landing_pad == 0xfffffffe) { return _URC_FAILURE; } else { /* bool is_reference_type = landing_pad & 0x80000000; void* matched_object; if (__cxxabiv1::__cxa_type_match( ucbp, reinterpret_cast(landing_pad), is_reference_type, &matched_object) != __cxxabiv1::ctm_failed) return _URC_HANDLER_FOUND; */ _LIBUNWIND_ABORT("Type matching not implemented"); } } break; } default: _LIBUNWIND_ABORT("Invalid descriptor kind found."); } getNextWord(descriptor, &descriptorWord); } return _URC_CONTINUE_UNWIND; } static _Unwind_Reason_Code unwindOneFrame(_Unwind_State state, _Unwind_Control_Block* ucbp, struct _Unwind_Context* context) { // Read the compact model EHT entry's header # 6.3 const uint32_t* unwindingData = ucbp->pr_cache.ehtp; assert((*unwindingData & 0xf0000000) == 0x80000000 && "Must be a compact entry"); Descriptor::Format format = static_cast((*unwindingData & 0x0f000000) >> 24); const char *lsda = reinterpret_cast(_Unwind_GetLanguageSpecificData(context)); // Handle descriptors before unwinding so they are processed in the context // of the correct stack frame. _Unwind_Reason_Code result = ProcessDescriptors(state, ucbp, context, format, lsda, ucbp->pr_cache.additional); if (result != _URC_CONTINUE_UNWIND) return result; if (unw_step(reinterpret_cast(context)) != UNW_STEP_SUCCESS) return _URC_FAILURE; return _URC_CONTINUE_UNWIND; } // Generates mask discriminator for _Unwind_VRS_Pop, e.g. for _UVRSC_CORE / // _UVRSD_UINT32. uint32_t RegisterMask(uint8_t start, uint8_t count_minus_one) { return ((1U << (count_minus_one + 1)) - 1) << start; } // Generates mask discriminator for _Unwind_VRS_Pop, e.g. for _UVRSC_VFP / // _UVRSD_DOUBLE. uint32_t RegisterRange(uint8_t start, uint8_t count_minus_one) { return ((uint32_t)start << 16) | ((uint32_t)count_minus_one + 1); } } // end anonymous namespace /** * Decodes an EHT entry. * * @param data Pointer to EHT. * @param[out] off Offset from return value (in bytes) to begin interpretation. * @param[out] len Number of bytes in unwind code. * @return Pointer to beginning of unwind code. */ extern "C" const uint32_t* decode_eht_entry(const uint32_t* data, size_t* off, size_t* len) { if ((*data & 0x80000000) == 0) { // 6.2: Generic Model // // EHT entry is a prel31 pointing to the PR, followed by data understood // only by the personality routine. Fortunately, all existing assembler // implementations, including GNU assembler, LLVM integrated assembler, // and ARM assembler, assume that the unwind opcodes come after the // personality rountine address. *off = 1; // First byte is size data. *len = (((data[1] >> 24) & 0xff) + 1) * 4; data++; // Skip the first word, which is the prel31 offset. 
} else { // 6.3: ARM Compact Model // // EHT entries here correspond to the __aeabi_unwind_cpp_pr[012] PRs indeded // by format: Descriptor::Format format = static_cast((*data & 0x0f000000) >> 24); switch (format) { case Descriptor::SU16: *len = 4; *off = 1; break; case Descriptor::LU16: case Descriptor::LU32: *len = 4 + 4 * ((*data & 0x00ff0000) >> 16); *off = 2; break; default: return nullptr; } } return data; } _Unwind_Reason_Code _Unwind_VRS_Interpret( _Unwind_Context* context, const uint32_t* data, size_t offset, size_t len) { bool wrotePC = false; bool finish = false; while (offset < len && !finish) { uint8_t byte = getByte(data, offset++); if ((byte & 0x80) == 0) { uint32_t sp; _Unwind_VRS_Get(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp); if (byte & 0x40) sp -= (((uint32_t)byte & 0x3f) << 2) + 4; else sp += ((uint32_t)byte << 2) + 4; _Unwind_VRS_Set(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp); } else { switch (byte & 0xf0) { case 0x80: { if (offset >= len) return _URC_FAILURE; uint32_t registers = (((uint32_t)byte & 0x0f) << 12) | (((uint32_t)getByte(data, offset++)) << 4); if (!registers) return _URC_FAILURE; if (registers & (1 << 15)) wrotePC = true; _Unwind_VRS_Pop(context, _UVRSC_CORE, registers, _UVRSD_UINT32); break; } case 0x90: { uint8_t reg = byte & 0x0f; if (reg == 13 || reg == 15) return _URC_FAILURE; uint32_t sp; _Unwind_VRS_Get(context, _UVRSC_CORE, UNW_ARM_R0 + reg, _UVRSD_UINT32, &sp); _Unwind_VRS_Set(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp); break; } case 0xa0: { uint32_t registers = RegisterMask(4, byte & 0x07); if (byte & 0x08) registers |= 1 << 14; _Unwind_VRS_Pop(context, _UVRSC_CORE, registers, _UVRSD_UINT32); break; } case 0xb0: { switch (byte) { case 0xb0: finish = true; break; case 0xb1: { if (offset >= len) return _URC_FAILURE; uint8_t registers = getByte(data, offset++); if (registers & 0xf0 || !registers) return _URC_FAILURE; _Unwind_VRS_Pop(context, _UVRSC_CORE, registers, _UVRSD_UINT32); break; } case 0xb2: { uint32_t addend = 0; uint32_t shift = 0; // This decodes a uleb128 value. while (true) { if (offset >= len) return _URC_FAILURE; uint32_t v = getByte(data, offset++); addend |= (v & 0x7f) << shift; if ((v & 0x80) == 0) break; shift += 7; } uint32_t sp; _Unwind_VRS_Get(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp); sp += 0x204 + (addend << 2); _Unwind_VRS_Set(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp); break; } case 0xb3: { uint8_t v = getByte(data, offset++); _Unwind_VRS_Pop(context, _UVRSC_VFP, RegisterRange(static_cast(v >> 4), v & 0x0f), _UVRSD_VFPX); break; } case 0xb4: case 0xb5: case 0xb6: case 0xb7: return _URC_FAILURE; default: _Unwind_VRS_Pop(context, _UVRSC_VFP, RegisterRange(8, byte & 0x07), _UVRSD_VFPX); break; } break; } case 0xc0: { switch (byte) { case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4: case 0xc5: _Unwind_VRS_Pop(context, _UVRSC_WMMXD, RegisterRange(10, byte & 0x7), _UVRSD_DOUBLE); break; case 0xc6: { uint8_t v = getByte(data, offset++); uint8_t start = static_cast(v >> 4); uint8_t count_minus_one = v & 0xf; if (start + count_minus_one >= 16) return _URC_FAILURE; _Unwind_VRS_Pop(context, _UVRSC_WMMXD, RegisterRange(start, count_minus_one), _UVRSD_DOUBLE); break; } case 0xc7: { uint8_t v = getByte(data, offset++); if (!v || v & 0xf0) return _URC_FAILURE; _Unwind_VRS_Pop(context, _UVRSC_WMMXC, v, _UVRSD_DOUBLE); break; } case 0xc8: case 0xc9: { uint8_t v = getByte(data, offset++); uint8_t start = static_cast(((byte == 0xc8) ? 
16 : 0) + (v >> 4)); uint8_t count_minus_one = v & 0xf; if (start + count_minus_one >= 32) return _URC_FAILURE; _Unwind_VRS_Pop(context, _UVRSC_VFP, RegisterRange(start, count_minus_one), _UVRSD_DOUBLE); break; } default: return _URC_FAILURE; } break; } case 0xd0: { if (byte & 0x08) return _URC_FAILURE; _Unwind_VRS_Pop(context, _UVRSC_VFP, RegisterRange(8, byte & 0x7), _UVRSD_DOUBLE); break; } default: return _URC_FAILURE; } } } if (!wrotePC) { uint32_t lr; _Unwind_VRS_Get(context, _UVRSC_CORE, UNW_ARM_LR, _UVRSD_UINT32, &lr); _Unwind_VRS_Set(context, _UVRSC_CORE, UNW_ARM_IP, _UVRSD_UINT32, &lr); } return _URC_CONTINUE_UNWIND; } extern "C" _Unwind_Reason_Code __aeabi_unwind_cpp_pr0( _Unwind_State state, _Unwind_Control_Block *ucbp, _Unwind_Context *context) { return unwindOneFrame(state, ucbp, context); } extern "C" _Unwind_Reason_Code __aeabi_unwind_cpp_pr1( _Unwind_State state, _Unwind_Control_Block *ucbp, _Unwind_Context *context) { return unwindOneFrame(state, ucbp, context); } extern "C" _Unwind_Reason_Code __aeabi_unwind_cpp_pr2( _Unwind_State state, _Unwind_Control_Block *ucbp, _Unwind_Context *context) { return unwindOneFrame(state, ucbp, context); } static _Unwind_Reason_Code unwind_phase1(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) { // EHABI #7.3 discusses preserving the VRS in a "temporary VRS" during // phase 1 and then restoring it to the "primary VRS" for phase 2. The // effect is phase 2 doesn't see any of the VRS manipulations from phase 1. // In this implementation, the phases don't share the VRS backing store. // Instead, they are passed the original |uc| and they create a new VRS // from scratch thus achieving the same effect. unw_init_local(cursor, uc); // Walk each frame looking for a place to stop. for (bool handlerNotFound = true; handlerNotFound;) { // See if frame has code to run (has personality routine). unw_proc_info_t frameInfo; if (unw_get_proc_info(cursor, &frameInfo) != UNW_ESUCCESS) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): unw_get_proc_info " - "failed => _URC_FATAL_PHASE1_ERROR\n", + "failed => _URC_FATAL_PHASE1_ERROR", static_cast(exception_object)); return _URC_FATAL_PHASE1_ERROR; } // When tracing, print state information. if (_LIBUNWIND_TRACING_UNWINDING) { char functionBuf[512]; const char *functionName = functionBuf; unw_word_t offset; if ((unw_get_proc_name(cursor, functionBuf, sizeof(functionBuf), &offset) != UNW_ESUCCESS) || (frameInfo.start_ip + offset > frameInfo.end_ip)) functionName = ".anonymous."; unw_word_t pc; unw_get_reg(cursor, UNW_REG_IP, &pc); _LIBUNWIND_TRACE_UNWINDING( "unwind_phase1(ex_ojb=%p): pc=0x%llX, start_ip=0x%llX, func=%s, " - "lsda=0x%llX, personality=0x%llX\n", + "lsda=0x%llX, personality=0x%llX", static_cast(exception_object), (long long)pc, (long long)frameInfo.start_ip, functionName, (long long)frameInfo.lsda, (long long)frameInfo.handler); } // If there is a personality routine, ask it if it will want to stop at // this frame. 
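The 0xb2 case above ("vsp = vsp + 0x204 + (uleb128 << 2)") decodes its ULEB128 operand inline; the same decoding as a standalone helper, with a worked value.

#include <cstddef>
#include <cstdint>

// Standalone ULEB128 decoder mirroring the inline loop in the 0xb2 handler.
// Returns false if the buffer ends in the middle of a value.
static bool decodeULEB128(const uint8_t *p, size_t len, size_t *pos,
                          uint32_t *out) {
  uint32_t result = 0;
  uint32_t shift = 0;
  while (*pos < len) {
    uint8_t byte = p[(*pos)++];
    result |= (uint32_t)(byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *out = result;
      return true;
    }
    shift += 7;
  }
  return false;
}

// Worked example: operand bytes 0x81 0x01 decode to 129, so the 0xb2 opcode
// adds 0x204 + (129 << 2) == 0x408 to vsp.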
if (frameInfo.handler != 0) { __personality_routine p = (__personality_routine)(long)(frameInfo.handler); _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): calling personality function %p\n", + "unwind_phase1(ex_ojb=%p): calling personality function %p", static_cast(exception_object), reinterpret_cast(reinterpret_cast(p))); struct _Unwind_Context *context = (struct _Unwind_Context *)(cursor); exception_object->pr_cache.fnstart = frameInfo.start_ip; exception_object->pr_cache.ehtp = (_Unwind_EHT_Header *)frameInfo.unwind_info; exception_object->pr_cache.additional = frameInfo.flags; _Unwind_Reason_Code personalityResult = (*p)(_US_VIRTUAL_UNWIND_FRAME, exception_object, context); _LIBUNWIND_TRACE_UNWINDING( "unwind_phase1(ex_ojb=%p): personality result %d start_ip %x ehtp %p " - "additional %x\n", + "additional %x", static_cast(exception_object), personalityResult, exception_object->pr_cache.fnstart, static_cast(exception_object->pr_cache.ehtp), exception_object->pr_cache.additional); switch (personalityResult) { case _URC_HANDLER_FOUND: // found a catch clause or locals that need destructing in this frame // stop search and remember stack pointer at the frame handlerNotFound = false; // p should have initialized barrier_cache. EHABI #7.3.5 _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): _URC_HANDLER_FOUND \n", + "unwind_phase1(ex_ojb=%p): _URC_HANDLER_FOUND", static_cast(exception_object)); return _URC_NO_REASON; case _URC_CONTINUE_UNWIND: _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): _URC_CONTINUE_UNWIND\n", + "unwind_phase1(ex_ojb=%p): _URC_CONTINUE_UNWIND", static_cast(exception_object)); // continue unwinding break; // EHABI #7.3.3 case _URC_FAILURE: return _URC_FAILURE; default: // something went wrong _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): _URC_FATAL_PHASE1_ERROR\n", + "unwind_phase1(ex_ojb=%p): _URC_FATAL_PHASE1_ERROR", static_cast(exception_object)); return _URC_FATAL_PHASE1_ERROR; } } } return _URC_NO_REASON; } static _Unwind_Reason_Code unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object, bool resume) { // See comment at the start of unwind_phase1 regarding VRS integrity. unw_init_local(cursor, uc); - _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p)\n", + _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p)", static_cast(exception_object)); int frame_count = 0; // Walk each frame until we reach where search phase said to stop. while (true) { - // Ask libuwind to get next frame (skip over first which is + // Ask libunwind to get next frame (skip over first which is // _Unwind_RaiseException or _Unwind_Resume). // // Resume only ever makes sense for 1 frame. _Unwind_State state = resume ? _US_UNWIND_FRAME_RESUME : _US_UNWIND_FRAME_STARTING; if (resume && frame_count == 1) { // On a resume, first unwind the _Unwind_Resume() frame. The next frame // is now the landing pad for the cleanup from a previous execution of // phase2. To continue unwindingly correctly, replace VRS[15] with the // IP of the frame that the previous run of phase2 installed the context // for. After this, continue unwinding as if normal. // // See #7.4.6 for details. unw_set_reg(cursor, UNW_REG_IP, exception_object->unwinder_cache.reserved2); resume = false; } // Get info about this frame. 
unw_word_t sp; unw_proc_info_t frameInfo; unw_get_reg(cursor, UNW_REG_SP, &sp); if (unw_get_proc_info(cursor, &frameInfo) != UNW_ESUCCESS) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): unw_get_proc_info " - "failed => _URC_FATAL_PHASE2_ERROR\n", + "failed => _URC_FATAL_PHASE2_ERROR", static_cast(exception_object)); return _URC_FATAL_PHASE2_ERROR; } // When tracing, print state information. if (_LIBUNWIND_TRACING_UNWINDING) { char functionBuf[512]; const char *functionName = functionBuf; unw_word_t offset; if ((unw_get_proc_name(cursor, functionBuf, sizeof(functionBuf), &offset) != UNW_ESUCCESS) || (frameInfo.start_ip + offset > frameInfo.end_ip)) functionName = ".anonymous."; _LIBUNWIND_TRACE_UNWINDING( "unwind_phase2(ex_ojb=%p): start_ip=0x%llX, func=%s, sp=0x%llX, " - "lsda=0x%llX, personality=0x%llX\n", + "lsda=0x%llX, personality=0x%llX", static_cast(exception_object), (long long)frameInfo.start_ip, functionName, (long long)sp, (long long)frameInfo.lsda, (long long)frameInfo.handler); } // If there is a personality routine, tell it we are unwinding. if (frameInfo.handler != 0) { __personality_routine p = (__personality_routine)(long)(frameInfo.handler); struct _Unwind_Context *context = (struct _Unwind_Context *)(cursor); // EHABI #7.2 exception_object->pr_cache.fnstart = frameInfo.start_ip; exception_object->pr_cache.ehtp = (_Unwind_EHT_Header *)frameInfo.unwind_info; exception_object->pr_cache.additional = frameInfo.flags; _Unwind_Reason_Code personalityResult = (*p)(state, exception_object, context); switch (personalityResult) { case _URC_CONTINUE_UNWIND: // Continue unwinding _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2(ex_ojb=%p): _URC_CONTINUE_UNWIND\n", + "unwind_phase2(ex_ojb=%p): _URC_CONTINUE_UNWIND", static_cast(exception_object)); // EHABI #7.2 if (sp == exception_object->barrier_cache.sp) { // Phase 1 said we would stop at this frame, but we did not... _LIBUNWIND_ABORT("during phase1 personality function said it would " "stop here, but now in phase2 it did not stop here"); } break; case _URC_INSTALL_CONTEXT: _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2(ex_ojb=%p): _URC_INSTALL_CONTEXT\n", + "unwind_phase2(ex_ojb=%p): _URC_INSTALL_CONTEXT", static_cast(exception_object)); // Personality routine says to transfer control to landing pad. // We may get control back if landing pad calls _Unwind_Resume(). if (_LIBUNWIND_TRACING_UNWINDING) { unw_word_t pc; unw_get_reg(cursor, UNW_REG_IP, &pc); unw_get_reg(cursor, UNW_REG_SP, &sp); _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): re-entering " - "user code with ip=0x%llX, sp=0x%llX\n", + "user code with ip=0x%llX, sp=0x%llX", static_cast(exception_object), (long long)pc, (long long)sp); } { // EHABI #7.4.1 says we need to preserve pc for when _Unwind_Resume // is called back, to find this same frame. unw_word_t pc; unw_get_reg(cursor, UNW_REG_IP, &pc); exception_object->unwinder_cache.reserved2 = (uint32_t)pc; } unw_resume(cursor); // unw_resume() only returns if there was an error. return _URC_FATAL_PHASE2_ERROR; // # EHABI #7.4.3 case _URC_FAILURE: abort(); default: // Personality routine returned an unknown result code. _LIBUNWIND_DEBUG_LOG("personality function returned unknown result %d", personalityResult); return _URC_FATAL_PHASE2_ERROR; } } frame_count++; } // Clean up phase did not resume at the frame that the search phase // said it would... return _URC_FATAL_PHASE2_ERROR; } /// Called by __cxa_throw. Only returns if there is a fatal error. 
_LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_RaiseException(ex_obj=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_RaiseException(ex_obj=%p)", static_cast(exception_object)); unw_context_t uc; unw_cursor_t cursor; unw_getcontext(&uc); // This field for is for compatibility with GCC to say this isn't a forced // unwind. EHABI #7.2 exception_object->unwinder_cache.reserved1 = 0; // phase 1: the search phase _Unwind_Reason_Code phase1 = unwind_phase1(&uc, &cursor, exception_object); if (phase1 != _URC_NO_REASON) return phase1; // phase 2: the clean up phase return unwind_phase2(&uc, &cursor, exception_object, false); } _LIBUNWIND_EXPORT void _Unwind_Complete(_Unwind_Exception* exception_object) { // This is to be called when exception handling completes to give us a chance // to perform any housekeeping. EHABI #7.2. But we have nothing to do here. (void)exception_object; } /// When _Unwind_RaiseException() is in phase2, it hands control /// to the personality function at each frame. The personality /// may force a jump to a landing pad in that function, the landing /// pad code may then call _Unwind_Resume() to continue with the /// unwinding. Note: the call to _Unwind_Resume() is from compiler /// geneated user code. All other _Unwind_* routines are called /// by the C++ runtime __cxa_* routines. /// /// Note: re-throwing an exception (as opposed to continuing the unwind) /// is implemented by having the code call __cxa_rethrow() which /// in turn calls _Unwind_Resume_or_Rethrow(). _LIBUNWIND_EXPORT void _Unwind_Resume(_Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_Resume(ex_obj=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_Resume(ex_obj=%p)", static_cast(exception_object)); unw_context_t uc; unw_cursor_t cursor; unw_getcontext(&uc); // _Unwind_RaiseException on EHABI will always set the reserved1 field to 0, // which is in the same position as private_1 below. // TODO(ajwong): Who wronte the above? Why is it true? unwind_phase2(&uc, &cursor, exception_object, true); // Clients assume _Unwind_Resume() does not return, so all we can do is abort. _LIBUNWIND_ABORT("_Unwind_Resume() can't return"); } /// Called by personality handler during phase 2 to get LSDA for current frame. 
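As a usage sketch (not part of this change), a __cxa_throw-style caller drives both phases through _Unwind_RaiseException; the helper name is hypothetical, and the sketch relies only on the fact that the call returns solely when no handler was found or a fatal error occurred.

#include <unwind.h>
#include <cstdlib>

// Hypothetical raise helper, modelled on what a language runtime does: if
// _Unwind_RaiseException returns at all, unwinding cannot proceed and the
// runtime typically terminates the process.
static void raise_or_die(_Unwind_Exception *exception_object) {
  _Unwind_Reason_Code rc = _Unwind_RaiseException(exception_object);
  (void)rc;     // e.g. _URC_END_OF_STACK or _URC_FATAL_PHASE1_ERROR
  std::abort(); // a C++ runtime would call std::terminate() here
}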
_LIBUNWIND_EXPORT uintptr_t _Unwind_GetLanguageSpecificData(struct _Unwind_Context *context) { unw_cursor_t *cursor = (unw_cursor_t *)context; unw_proc_info_t frameInfo; uintptr_t result = 0; if (unw_get_proc_info(cursor, &frameInfo) == UNW_ESUCCESS) result = (uintptr_t)frameInfo.lsda; _LIBUNWIND_TRACE_API( - "_Unwind_GetLanguageSpecificData(context=%p) => 0x%llx\n", + "_Unwind_GetLanguageSpecificData(context=%p) => 0x%llx", static_cast(context), (long long)result); return result; } static uint64_t ValueAsBitPattern(_Unwind_VRS_DataRepresentation representation, void* valuep) { uint64_t value = 0; switch (representation) { case _UVRSD_UINT32: case _UVRSD_FLOAT: memcpy(&value, valuep, sizeof(uint32_t)); break; case _UVRSD_VFPX: case _UVRSD_UINT64: case _UVRSD_DOUBLE: memcpy(&value, valuep, sizeof(uint64_t)); break; } return value; } _Unwind_VRS_Result _Unwind_VRS_Set(_Unwind_Context *context, _Unwind_VRS_RegClass regclass, uint32_t regno, _Unwind_VRS_DataRepresentation representation, void *valuep) { _LIBUNWIND_TRACE_API("_Unwind_VRS_Set(context=%p, regclass=%d, reg=%d, " - "rep=%d, value=0x%llX)\n", + "rep=%d, value=0x%llX)", static_cast(context), regclass, regno, representation, ValueAsBitPattern(representation, valuep)); unw_cursor_t *cursor = (unw_cursor_t *)context; switch (regclass) { case _UVRSC_CORE: if (representation != _UVRSD_UINT32 || regno > 15) return _UVRSR_FAILED; return unw_set_reg(cursor, (unw_regnum_t)(UNW_ARM_R0 + regno), *(unw_word_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; case _UVRSC_WMMXC: if (representation != _UVRSD_UINT32 || regno > 3) return _UVRSR_FAILED; return unw_set_reg(cursor, (unw_regnum_t)(UNW_ARM_WC0 + regno), *(unw_word_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; case _UVRSC_VFP: if (representation != _UVRSD_VFPX && representation != _UVRSD_DOUBLE) return _UVRSR_FAILED; if (representation == _UVRSD_VFPX) { // Can only touch d0-15 with FSTMFDX. if (regno > 15) return _UVRSR_FAILED; unw_save_vfp_as_X(cursor); } else { if (regno > 31) return _UVRSR_FAILED; } return unw_set_fpreg(cursor, (unw_regnum_t)(UNW_ARM_D0 + regno), *(unw_fpreg_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; case _UVRSC_WMMXD: if (representation != _UVRSD_DOUBLE || regno > 31) return _UVRSR_FAILED; return unw_set_fpreg(cursor, (unw_regnum_t)(UNW_ARM_WR0 + regno), *(unw_fpreg_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; } _LIBUNWIND_ABORT("unsupported register class"); } static _Unwind_VRS_Result _Unwind_VRS_Get_Internal(_Unwind_Context *context, _Unwind_VRS_RegClass regclass, uint32_t regno, _Unwind_VRS_DataRepresentation representation, void *valuep) { unw_cursor_t *cursor = (unw_cursor_t *)context; switch (regclass) { case _UVRSC_CORE: if (representation != _UVRSD_UINT32 || regno > 15) return _UVRSR_FAILED; return unw_get_reg(cursor, (unw_regnum_t)(UNW_ARM_R0 + regno), (unw_word_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; case _UVRSC_WMMXC: if (representation != _UVRSD_UINT32 || regno > 3) return _UVRSR_FAILED; return unw_get_reg(cursor, (unw_regnum_t)(UNW_ARM_WC0 + regno), (unw_word_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; case _UVRSC_VFP: if (representation != _UVRSD_VFPX && representation != _UVRSD_DOUBLE) return _UVRSR_FAILED; if (representation == _UVRSD_VFPX) { // Can only touch d0-15 with FSTMFDX. 
if (regno > 15) return _UVRSR_FAILED; unw_save_vfp_as_X(cursor); } else { if (regno > 31) return _UVRSR_FAILED; } return unw_get_fpreg(cursor, (unw_regnum_t)(UNW_ARM_D0 + regno), (unw_fpreg_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; case _UVRSC_WMMXD: if (representation != _UVRSD_DOUBLE || regno > 31) return _UVRSR_FAILED; return unw_get_fpreg(cursor, (unw_regnum_t)(UNW_ARM_WR0 + regno), (unw_fpreg_t *)valuep) == UNW_ESUCCESS ? _UVRSR_OK : _UVRSR_FAILED; } _LIBUNWIND_ABORT("unsupported register class"); } _Unwind_VRS_Result _Unwind_VRS_Get( _Unwind_Context *context, _Unwind_VRS_RegClass regclass, uint32_t regno, _Unwind_VRS_DataRepresentation representation, void *valuep) { _Unwind_VRS_Result result = _Unwind_VRS_Get_Internal(context, regclass, regno, representation, valuep); _LIBUNWIND_TRACE_API("_Unwind_VRS_Get(context=%p, regclass=%d, reg=%d, " - "rep=%d, value=0x%llX, result = %d)\n", + "rep=%d, value=0x%llX, result = %d)", static_cast(context), regclass, regno, representation, ValueAsBitPattern(representation, valuep), result); return result; } _Unwind_VRS_Result _Unwind_VRS_Pop(_Unwind_Context *context, _Unwind_VRS_RegClass regclass, uint32_t discriminator, _Unwind_VRS_DataRepresentation representation) { _LIBUNWIND_TRACE_API("_Unwind_VRS_Pop(context=%p, regclass=%d, " - "discriminator=%d, representation=%d)\n", + "discriminator=%d, representation=%d)", static_cast(context), regclass, discriminator, representation); switch (regclass) { case _UVRSC_CORE: case _UVRSC_WMMXC: { if (representation != _UVRSD_UINT32) return _UVRSR_FAILED; // When popping SP from the stack, we don't want to override it from the // computed new stack location. See EHABI #7.5.4 table 3. bool poppedSP = false; uint32_t* sp; if (_Unwind_VRS_Get(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp) != _UVRSR_OK) { return _UVRSR_FAILED; } for (uint32_t i = 0; i < 16; ++i) { if (!(discriminator & static_cast(1 << i))) continue; uint32_t value = *sp++; if (regclass == _UVRSC_CORE && i == 13) poppedSP = true; if (_Unwind_VRS_Set(context, regclass, i, _UVRSD_UINT32, &value) != _UVRSR_OK) { return _UVRSR_FAILED; } } if (!poppedSP) { return _Unwind_VRS_Set(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp); } return _UVRSR_OK; } case _UVRSC_VFP: case _UVRSC_WMMXD: { if (representation != _UVRSD_VFPX && representation != _UVRSD_DOUBLE) return _UVRSR_FAILED; uint32_t first = discriminator >> 16; uint32_t count = discriminator & 0xffff; uint32_t end = first+count; uint32_t* sp; if (_Unwind_VRS_Get(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp) != _UVRSR_OK) { return _UVRSR_FAILED; } // For _UVRSD_VFPX, we're assuming the data is stored in FSTMX "standard // format 1", which is equivalent to FSTMD + a padding word. for (uint32_t i = first; i < end; ++i) { // SP is only 32-bit aligned so don't copy 64-bit at a time. uint64_t value = *sp++; value |= ((uint64_t)(*sp++)) << 32; if (_Unwind_VRS_Set(context, regclass, i, representation, &value) != _UVRSR_OK) return _UVRSR_FAILED; } if (representation == _UVRSD_VFPX) ++sp; return _Unwind_VRS_Set(context, _UVRSC_CORE, UNW_ARM_SP, _UVRSD_UINT32, &sp); } } _LIBUNWIND_ABORT("unsupported register class"); } /// Called by personality handler during phase 2 to find the start of the /// function. 
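A usage sketch of the VRS interface above, assuming an ARM EHABI build: the branch of a personality routine that has located its landing pad passes the exception object in r0, redirects pc, and asks the unwinder to install that context. The landing-pad value is a stand-in.

#include <unwind.h>
#include <cstdint>

// Schematic phase-2 handler branch of an EHABI personality routine.
static _Unwind_Reason_Code
install_landing_pad(_Unwind_Control_Block *ucbp, _Unwind_Context *context,
                    uint32_t landingPad) {
  uint32_t excPtr = (uint32_t)(uintptr_t)ucbp;
  _Unwind_VRS_Set(context, _UVRSC_CORE, 0, _UVRSD_UINT32, &excPtr);      // r0
  _Unwind_VRS_Set(context, _UVRSC_CORE, 15, _UVRSD_UINT32, &landingPad); // pc
  return _URC_INSTALL_CONTEXT;
}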
_LIBUNWIND_EXPORT uintptr_t _Unwind_GetRegionStart(struct _Unwind_Context *context) { unw_cursor_t *cursor = (unw_cursor_t *)context; unw_proc_info_t frameInfo; uintptr_t result = 0; if (unw_get_proc_info(cursor, &frameInfo) == UNW_ESUCCESS) result = (uintptr_t)frameInfo.start_ip; - _LIBUNWIND_TRACE_API("_Unwind_GetRegionStart(context=%p) => 0x%llX\n", + _LIBUNWIND_TRACE_API("_Unwind_GetRegionStart(context=%p) => 0x%llX", static_cast(context), (long long)result); return result; } /// Called by personality handler during phase 2 if a foreign exception // is caught. _LIBUNWIND_EXPORT void _Unwind_DeleteException(_Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_DeleteException(ex_obj=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_DeleteException(ex_obj=%p)", static_cast(exception_object)); if (exception_object->exception_cleanup != NULL) (*exception_object->exception_cleanup)(_URC_FOREIGN_EXCEPTION_CAUGHT, exception_object); } extern "C" _LIBUNWIND_EXPORT _Unwind_Reason_Code __gnu_unwind_frame(_Unwind_Exception *exception_object, struct _Unwind_Context *context) { unw_cursor_t *cursor = (unw_cursor_t *)context; if (unw_step(cursor) != UNW_STEP_SUCCESS) return _URC_FAILURE; return _URC_OK; } #endif // _LIBUNWIND_ARM_EHABI Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/Unwind-sjlj.c =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/Unwind-sjlj.c (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/Unwind-sjlj.c (revision 308054) @@ -1,468 +1,468 @@ //===--------------------------- Unwind-sjlj.c ----------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Implements setjump-longjump based C++ exceptions // //===----------------------------------------------------------------------===// #include #include #include #include #include "config.h" #include "unwind_ext.h" // // 32-bit iOS uses setjump/longjump based C++ exceptions. // Other architectures use "zero cost" exceptions. // // With SJLJ based exceptions, any function that has a catch clause or needs to // do any clean up when an exception propagates through it, needs to call // _Unwind_SjLj_Register() at the start of the function and // _Unwind_SjLj_Unregister() at the end. The register function is called with // the address of a block of memory in the function's stack frame. The runtime // keeps a linked list (stack) of these blocks - one per thread. The calling // function also sets the personality and lsda fields of the block. 
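A compiler emits this bookkeeping automatically, but the protocol described above can be modelled by hand. The following self-contained sketch uses simplified stand-in names rather than the real _Unwind_FunctionContext defined below.

#include <cstdint>

// Stand-in for the per-frame block kept on the function's own stack.
struct FrameRecord {
  FrameRecord *prev;
  void (*personality)(); // set by the function before registering
  uintptr_t lsda;        // likewise
};

static thread_local FrameRecord *topOfFunctionStack = nullptr;

static void registerFrame(FrameRecord *fc) {
  fc->prev = topOfFunctionStack; // push onto the per-thread chain
  topOfFunctionStack = fc;
}

static void unregisterFrame(FrameRecord *fc) {
  topOfFunctionStack = fc->prev; // pop on the way out
}

// Shape of a function that has cleanups or catch clauses:
static void functionWithCleanups() {
  FrameRecord fc;
  fc.personality = nullptr; // real frames point at e.g. __gxx_personality_sj0
  fc.lsda = 0;
  registerFrame(&fc);
  // ... body; an exception raised from here walks the chain built above ...
  unregisterFrame(&fc);
}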
// #if _LIBUNWIND_BUILD_SJLJ_APIS struct _Unwind_FunctionContext { // next function in stack of handlers struct _Unwind_FunctionContext *prev; // set by calling function before registering to be the landing pad uintptr_t resumeLocation; // set by personality handler to be parameters passed to landing pad function uintptr_t resumeParameters[4]; // set by calling function before registering __personality_routine personality; // arm offset=24 uintptr_t lsda; // arm offset=28 // variable length array, contains registers to restore // 0 = r7, 1 = pc, 2 = sp void *jbuf[]; }; /// Called at start of each function that catches exceptions _LIBUNWIND_EXPORT void _Unwind_SjLj_Register(struct _Unwind_FunctionContext *fc) { fc->prev = __Unwind_SjLj_GetTopOfFunctionStack(); __Unwind_SjLj_SetTopOfFunctionStack(fc); } /// Called at end of each function that catches exceptions _LIBUNWIND_EXPORT void _Unwind_SjLj_Unregister(struct _Unwind_FunctionContext *fc) { __Unwind_SjLj_SetTopOfFunctionStack(fc->prev); } static _Unwind_Reason_Code unwind_phase1(struct _Unwind_Exception *exception_object) { _Unwind_FunctionContext_t c = __Unwind_SjLj_GetTopOfFunctionStack(); - _LIBUNWIND_TRACE_UNWINDING("unwind_phase1: initial function-context=%p\n", c); + _LIBUNWIND_TRACE_UNWINDING("unwind_phase1: initial function-context=%p", c); // walk each frame looking for a place to stop for (bool handlerNotFound = true; handlerNotFound; c = c->prev) { // check for no more frames if (c == NULL) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): reached " - "bottom => _URC_END_OF_STACK\n", + "bottom => _URC_END_OF_STACK", exception_object); return _URC_END_OF_STACK; } - _LIBUNWIND_TRACE_UNWINDING("unwind_phase1: function-context=%p\n", c); + _LIBUNWIND_TRACE_UNWINDING("unwind_phase1: function-context=%p", c); // if there is a personality routine, ask it if it will want to stop at this // frame if (c->personality != NULL) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): calling " - "personality function %p\n", + "personality function %p", exception_object, c->personality); _Unwind_Reason_Code personalityResult = (*c->personality)( 1, _UA_SEARCH_PHASE, exception_object->exception_class, exception_object, (struct _Unwind_Context *)c); switch (personalityResult) { case _URC_HANDLER_FOUND: // found a catch clause or locals that need destructing in this frame // stop search and remember function context handlerNotFound = false; exception_object->private_2 = (uintptr_t) c; _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): " - "_URC_HANDLER_FOUND\n", exception_object); + "_URC_HANDLER_FOUND", exception_object); return _URC_NO_REASON; case _URC_CONTINUE_UNWIND: _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): " - "_URC_CONTINUE_UNWIND\n", exception_object); + "_URC_CONTINUE_UNWIND", exception_object); // continue unwinding break; default: // something went wrong _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): _URC_FATAL_PHASE1_ERROR\n", + "unwind_phase1(ex_ojb=%p): _URC_FATAL_PHASE1_ERROR", exception_object); return _URC_FATAL_PHASE1_ERROR; } } } return _URC_NO_REASON; } static _Unwind_Reason_Code unwind_phase2(struct _Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p)\n", exception_object); + _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p)", exception_object); // walk each frame until we reach where search phase said to stop _Unwind_FunctionContext_t c = __Unwind_SjLj_GetTopOfFunctionStack(); while (true) { - _LIBUNWIND_TRACE_UNWINDING("unwind_phase2s(ex_ojb=%p): 
context=%p\n", + _LIBUNWIND_TRACE_UNWINDING("unwind_phase2s(ex_ojb=%p): context=%p", exception_object, c); // check for no more frames if (c == NULL) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): unw_step() reached " - "bottom => _URC_END_OF_STACK\n", + "bottom => _URC_END_OF_STACK", exception_object); return _URC_END_OF_STACK; } // if there is a personality routine, tell it we are unwinding if (c->personality != NULL) { _Unwind_Action action = _UA_CLEANUP_PHASE; if ((uintptr_t) c == exception_object->private_2) action = (_Unwind_Action)( _UA_CLEANUP_PHASE | _UA_HANDLER_FRAME); // tell personality this was the frame it marked // in phase 1 _Unwind_Reason_Code personalityResult = (*c->personality)(1, action, exception_object->exception_class, exception_object, (struct _Unwind_Context *)c); switch (personalityResult) { case _URC_CONTINUE_UNWIND: // continue unwinding _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2(ex_ojb=%p): _URC_CONTINUE_UNWIND\n", + "unwind_phase2(ex_ojb=%p): _URC_CONTINUE_UNWIND", exception_object); if ((uintptr_t) c == exception_object->private_2) { // phase 1 said we would stop at this frame, but we did not... _LIBUNWIND_ABORT("during phase1 personality function said it would " "stop here, but now if phase2 it did not stop here"); } break; case _URC_INSTALL_CONTEXT: _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): " "_URC_INSTALL_CONTEXT, will resume at " - "landing pad %p\n", + "landing pad %p", exception_object, c->jbuf[1]); // personality routine says to transfer control to landing pad // we may get control back if landing pad calls _Unwind_Resume() __Unwind_SjLj_SetTopOfFunctionStack(c); __builtin_longjmp(c->jbuf, 1); // unw_resume() only returns if there was an error return _URC_FATAL_PHASE2_ERROR; default: // something went wrong _LIBUNWIND_DEBUG_LOG("personality function returned unknown result %d", personalityResult); return _URC_FATAL_PHASE2_ERROR; } } c = c->prev; } // clean up phase did not resume at the frame that the search phase said it // would return _URC_FATAL_PHASE2_ERROR; } static _Unwind_Reason_Code unwind_phase2_forced(struct _Unwind_Exception *exception_object, _Unwind_Stop_Fn stop, void *stop_parameter) { // walk each frame until we reach where search phase said to stop _Unwind_FunctionContext_t c = __Unwind_SjLj_GetTopOfFunctionStack(); while (true) { // get next frame (skip over first which is _Unwind_RaiseException) if (c == NULL) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): unw_step() reached " - "bottom => _URC_END_OF_STACK\n", + "bottom => _URC_END_OF_STACK", exception_object); return _URC_END_OF_STACK; } // call stop function at each frame _Unwind_Action action = (_Unwind_Action)(_UA_FORCE_UNWIND | _UA_CLEANUP_PHASE); _Unwind_Reason_Code stopResult = (*stop)(1, action, exception_object->exception_class, exception_object, (struct _Unwind_Context *)c, stop_parameter); _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " - "stop function returned %d\n", + "stop function returned %d", exception_object, stopResult); if (stopResult != _URC_NO_REASON) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " - "stopped by stop function\n", + "stopped by stop function", exception_object); return _URC_FATAL_PHASE2_ERROR; } // if there is a personality routine, tell it we are unwinding if (c->personality != NULL) { __personality_routine p = (__personality_routine) c->personality; _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " - "calling personality function %p\n", + "calling personality 
function %p", exception_object, p); _Unwind_Reason_Code personalityResult = (*p)(1, action, exception_object->exception_class, exception_object, (struct _Unwind_Context *)c); switch (personalityResult) { case _URC_CONTINUE_UNWIND: _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " - "personality returned _URC_CONTINUE_UNWIND\n", + "personality returned _URC_CONTINUE_UNWIND", exception_object); // destructors called, continue unwinding break; case _URC_INSTALL_CONTEXT: _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " - "personality returned _URC_INSTALL_CONTEXT\n", + "personality returned _URC_INSTALL_CONTEXT", exception_object); // we may get control back if landing pad calls _Unwind_Resume() __Unwind_SjLj_SetTopOfFunctionStack(c); __builtin_longjmp(c->jbuf, 1); break; default: // something went wrong _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " "personality returned %d, " - "_URC_FATAL_PHASE2_ERROR\n", + "_URC_FATAL_PHASE2_ERROR", exception_object, personalityResult); return _URC_FATAL_PHASE2_ERROR; } } c = c->prev; } // call stop function one last time and tell it we've reached the end of the // stack _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): calling stop " - "function with _UA_END_OF_STACK\n", + "function with _UA_END_OF_STACK", exception_object); _Unwind_Action lastAction = (_Unwind_Action)(_UA_FORCE_UNWIND | _UA_CLEANUP_PHASE | _UA_END_OF_STACK); (*stop)(1, lastAction, exception_object->exception_class, exception_object, (struct _Unwind_Context *)c, stop_parameter); // clean up phase did not resume at the frame that the search phase said it // would return _URC_FATAL_PHASE2_ERROR; } /// Called by __cxa_throw. Only returns if there is a fatal error _LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_SjLj_RaiseException(ex_obj=%p)\n", exception_object); + _LIBUNWIND_TRACE_API("_Unwind_SjLj_RaiseException(ex_obj=%p)", exception_object); // mark that this is a non-forced unwind, so _Unwind_Resume() can do the right // thing exception_object->private_1 = 0; exception_object->private_2 = 0; // phase 1: the search phase _Unwind_Reason_Code phase1 = unwind_phase1(exception_object); if (phase1 != _URC_NO_REASON) return phase1; // phase 2: the clean up phase return unwind_phase2(exception_object); } /// When _Unwind_RaiseException() is in phase2, it hands control /// to the personality function at each frame. The personality /// may force a jump to a landing pad in that function, the landing /// pad code may then call _Unwind_Resume() to continue with the /// unwinding. Note: the call to _Unwind_Resume() is from compiler /// geneated user code. All other _Unwind_* routines are called /// by the C++ runtime __cxa_* routines. /// /// Re-throwing an exception is implemented by having the code call /// __cxa_rethrow() which in turn calls _Unwind_Resume_or_Rethrow() _LIBUNWIND_EXPORT void _Unwind_SjLj_Resume(struct _Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_SjLj_Resume(ex_obj=%p)\n", exception_object); + _LIBUNWIND_TRACE_API("_Unwind_SjLj_Resume(ex_obj=%p)", exception_object); if (exception_object->private_1 != 0) unwind_phase2_forced(exception_object, (_Unwind_Stop_Fn) exception_object->private_1, (void *)exception_object->private_2); else unwind_phase2(exception_object); // clients assume _Unwind_Resume() does not return, so all we can do is abort. 
_LIBUNWIND_ABORT("_Unwind_SjLj_Resume() can't return"); } /// Called by __cxa_rethrow(). _LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *exception_object) { _LIBUNWIND_TRACE_API("__Unwind_SjLj_Resume_or_Rethrow(ex_obj=%p), " - "private_1=%ld\n", + "private_1=%ld", exception_object, exception_object->private_1); // If this is non-forced and a stopping place was found, then this is a // re-throw. // Call _Unwind_RaiseException() as if this was a new exception. if (exception_object->private_1 == 0) { return _Unwind_SjLj_RaiseException(exception_object); // should return if there is no catch clause, so that __cxa_rethrow can call // std::terminate() } // Call through to _Unwind_Resume() which distiguishes between forced and // regular exceptions. _Unwind_SjLj_Resume(exception_object); _LIBUNWIND_ABORT("__Unwind_SjLj_Resume_or_Rethrow() called " "_Unwind_SjLj_Resume() which unexpectedly returned"); } /// Called by personality handler during phase 2 to get LSDA for current frame. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetLanguageSpecificData(struct _Unwind_Context *context) { _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context; _LIBUNWIND_TRACE_API("_Unwind_GetLanguageSpecificData(context=%p) " - "=> 0x%0lX\n", context, ufc->lsda); + "=> 0x%0lX", context, ufc->lsda); return ufc->lsda; } /// Called by personality handler during phase 2 to get register values. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetGR(struct _Unwind_Context *context, int index) { - _LIBUNWIND_TRACE_API("_Unwind_GetGR(context=%p, reg=%d)\n", + _LIBUNWIND_TRACE_API("_Unwind_GetGR(context=%p, reg=%d)", context, index); _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context; return ufc->resumeParameters[index]; } /// Called by personality handler during phase 2 to alter register values. _LIBUNWIND_EXPORT void _Unwind_SetGR(struct _Unwind_Context *context, int index, uintptr_t new_value) { - _LIBUNWIND_TRACE_API("_Unwind_SetGR(context=%p, reg=%d, value=0x%0lX)\n" + _LIBUNWIND_TRACE_API("_Unwind_SetGR(context=%p, reg=%d, value=0x%0lX)" , context, index, new_value); _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context; ufc->resumeParameters[index] = new_value; } /// Called by personality handler during phase 2 to get instruction pointer. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetIP(struct _Unwind_Context *context) { _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context; - _LIBUNWIND_TRACE_API("_Unwind_GetIP(context=%p) => 0x%lX\n", context, + _LIBUNWIND_TRACE_API("_Unwind_GetIP(context=%p) => 0x%lX", context, ufc->resumeLocation + 1); return ufc->resumeLocation + 1; } /// Called by personality handler during phase 2 to get instruction pointer. /// ipBefore is a boolean that says if IP is already adjusted to be the call /// site address. Normally IP is the return address. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetIPInfo(struct _Unwind_Context *context, int *ipBefore) { _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context; *ipBefore = 0; - _LIBUNWIND_TRACE_API("_Unwind_GetIPInfo(context=%p, %p) => 0x%lX\n", + _LIBUNWIND_TRACE_API("_Unwind_GetIPInfo(context=%p, %p) => 0x%lX", context, ipBefore, ufc->resumeLocation + 1); return ufc->resumeLocation + 1; } /// Called by personality handler during phase 2 to alter instruction pointer. 
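A usage sketch of the accessors above (and of _Unwind_SetIP, defined next) from a personality routine that has found its handler: it hands the exception object and switch value to the landing pad through the first two data slots and redirects the IP. The landing-pad address and selector are stand-ins; real code obtains the slot numbers with __builtin_eh_return_data_regno().

#include <unwind.h>
#include <cstdint>

// Schematic phase-2 handler branch of a personality routine.
static _Unwind_Reason_Code
transfer_to_landing_pad(_Unwind_Exception *ue, struct _Unwind_Context *context,
                        uintptr_t landingPad, uintptr_t switchValue) {
  _Unwind_SetGR(context, 0, (uintptr_t)ue); // slot 0: exception object
  _Unwind_SetGR(context, 1, switchValue);   // slot 1: handler switch value
  _Unwind_SetIP(context, landingPad);
  return _URC_INSTALL_CONTEXT;
}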
_LIBUNWIND_EXPORT void _Unwind_SetIP(struct _Unwind_Context *context, uintptr_t new_value) { - _LIBUNWIND_TRACE_API("_Unwind_SetIP(context=%p, value=0x%0lX)\n", + _LIBUNWIND_TRACE_API("_Unwind_SetIP(context=%p, value=0x%0lX)", context, new_value); _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context; ufc->resumeLocation = new_value - 1; } /// Called by personality handler during phase 2 to find the start of the /// function. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetRegionStart(struct _Unwind_Context *context) { // Not supported or needed for sjlj based unwinding (void)context; - _LIBUNWIND_TRACE_API("_Unwind_GetRegionStart(context=%p)\n", context); + _LIBUNWIND_TRACE_API("_Unwind_GetRegionStart(context=%p)", context); return 0; } /// Called by personality handler during phase 2 if a foreign exception /// is caught. _LIBUNWIND_EXPORT void _Unwind_DeleteException(struct _Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_DeleteException(ex_obj=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_DeleteException(ex_obj=%p)", exception_object); if (exception_object->exception_cleanup != NULL) (*exception_object->exception_cleanup)(_URC_FOREIGN_EXCEPTION_CAUGHT, exception_object); } /// Called by personality handler during phase 2 to get base address for data /// relative encodings. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetDataRelBase(struct _Unwind_Context *context) { // Not supported or needed for sjlj based unwinding (void)context; - _LIBUNWIND_TRACE_API("_Unwind_GetDataRelBase(context=%p)\n", context); + _LIBUNWIND_TRACE_API("_Unwind_GetDataRelBase(context=%p)", context); _LIBUNWIND_ABORT("_Unwind_GetDataRelBase() not implemented"); } /// Called by personality handler during phase 2 to get base address for text /// relative encodings. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetTextRelBase(struct _Unwind_Context *context) { // Not supported or needed for sjlj based unwinding (void)context; - _LIBUNWIND_TRACE_API("_Unwind_GetTextRelBase(context=%p)\n", context); + _LIBUNWIND_TRACE_API("_Unwind_GetTextRelBase(context=%p)", context); _LIBUNWIND_ABORT("_Unwind_GetTextRelBase() not implemented"); } /// Called by personality handler to get "Call Frame Area" for current frame. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetCFA(struct _Unwind_Context *context) { - _LIBUNWIND_TRACE_API("_Unwind_GetCFA(context=%p)\n", context); + _LIBUNWIND_TRACE_API("_Unwind_GetCFA(context=%p)", context); if (context != NULL) { _Unwind_FunctionContext_t ufc = (_Unwind_FunctionContext_t) context; // Setjmp/longjmp based exceptions don't have a true CFA. // Instead, the SP in the jmpbuf is the closest approximation. return (uintptr_t) ufc->jbuf[2]; } return 0; } #endif // _LIBUNWIND_BUILD_SJLJ_APIS Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindCursor.hpp =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindCursor.hpp (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindCursor.hpp (revision 308054) @@ -1,1372 +1,1372 @@ //===------------------------- UnwindCursor.hpp ---------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. 
// // -// C++ interface to lower levels of libuwind +// C++ interface to lower levels of libunwind //===----------------------------------------------------------------------===// #ifndef __UNWINDCURSOR_HPP__ #define __UNWINDCURSOR_HPP__ #include #include #include #include #include #include #ifdef __APPLE__ #include #endif #include "config.h" #include "AddressSpace.hpp" #include "CompactUnwinder.hpp" #include "config.h" #include "DwarfInstructions.hpp" #include "EHHeaderParser.hpp" #include "libunwind.h" #include "Registers.hpp" #include "Unwind-EHABI.h" namespace libunwind { #if _LIBUNWIND_SUPPORT_DWARF_UNWIND /// Cache of recently found FDEs. template class _LIBUNWIND_HIDDEN DwarfFDECache { typedef typename A::pint_t pint_t; public: static pint_t findFDE(pint_t mh, pint_t pc); static void add(pint_t mh, pint_t ip_start, pint_t ip_end, pint_t fde); static void removeAllIn(pint_t mh); static void iterateCacheEntries(void (*func)(unw_word_t ip_start, unw_word_t ip_end, unw_word_t fde, unw_word_t mh)); private: struct entry { pint_t mh; pint_t ip_start; pint_t ip_end; pint_t fde; }; // These fields are all static to avoid needing an initializer. // There is only one instance of this class per process. static pthread_rwlock_t _lock; #ifdef __APPLE__ static void dyldUnloadHook(const struct mach_header *mh, intptr_t slide); static bool _registeredForDyldUnloads; #endif // Can't use std::vector<> here because this code is below libc++. static entry *_buffer; static entry *_bufferUsed; static entry *_bufferEnd; static entry _initialBuffer[64]; }; template typename DwarfFDECache::entry * DwarfFDECache::_buffer = _initialBuffer; template typename DwarfFDECache::entry * DwarfFDECache::_bufferUsed = _initialBuffer; template typename DwarfFDECache::entry * DwarfFDECache::_bufferEnd = &_initialBuffer[64]; template typename DwarfFDECache::entry DwarfFDECache::_initialBuffer[64]; template pthread_rwlock_t DwarfFDECache::_lock = PTHREAD_RWLOCK_INITIALIZER; #ifdef __APPLE__ template bool DwarfFDECache::_registeredForDyldUnloads = false; #endif template typename A::pint_t DwarfFDECache::findFDE(pint_t mh, pint_t pc) { pint_t result = 0; _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_rdlock(&_lock)); for (entry *p = _buffer; p < _bufferUsed; ++p) { if ((mh == p->mh) || (mh == 0)) { if ((p->ip_start <= pc) && (pc < p->ip_end)) { result = p->fde; break; } } } _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_unlock(&_lock)); return result; } template void DwarfFDECache::add(pint_t mh, pint_t ip_start, pint_t ip_end, pint_t fde) { #if !defined(_LIBUNWIND_NO_HEAP) _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_wrlock(&_lock)); if (_bufferUsed >= _bufferEnd) { size_t oldSize = (size_t)(_bufferEnd - _buffer); size_t newSize = oldSize * 4; // Can't use operator new (we are below it). 
entry *newBuffer = (entry *)malloc(newSize * sizeof(entry)); memcpy(newBuffer, _buffer, oldSize * sizeof(entry)); if (_buffer != _initialBuffer) free(_buffer); _buffer = newBuffer; _bufferUsed = &newBuffer[oldSize]; _bufferEnd = &newBuffer[newSize]; } _bufferUsed->mh = mh; _bufferUsed->ip_start = ip_start; _bufferUsed->ip_end = ip_end; _bufferUsed->fde = fde; ++_bufferUsed; #ifdef __APPLE__ if (!_registeredForDyldUnloads) { _dyld_register_func_for_remove_image(&dyldUnloadHook); _registeredForDyldUnloads = true; } #endif _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_unlock(&_lock)); #endif } template void DwarfFDECache::removeAllIn(pint_t mh) { _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_wrlock(&_lock)); entry *d = _buffer; for (const entry *s = _buffer; s < _bufferUsed; ++s) { if (s->mh != mh) { if (d != s) *d = *s; ++d; } } _bufferUsed = d; _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_unlock(&_lock)); } #ifdef __APPLE__ template void DwarfFDECache::dyldUnloadHook(const struct mach_header *mh, intptr_t ) { removeAllIn((pint_t) mh); } #endif template void DwarfFDECache::iterateCacheEntries(void (*func)( unw_word_t ip_start, unw_word_t ip_end, unw_word_t fde, unw_word_t mh)) { _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_wrlock(&_lock)); for (entry *p = _buffer; p < _bufferUsed; ++p) { (*func)(p->ip_start, p->ip_end, p->fde, p->mh); } _LIBUNWIND_LOG_NON_ZERO(::pthread_rwlock_unlock(&_lock)); } #endif // _LIBUNWIND_SUPPORT_DWARF_UNWIND #define arrayoffsetof(type, index, field) ((size_t)(&((type *)0)[index].field)) #if _LIBUNWIND_SUPPORT_COMPACT_UNWIND template class UnwindSectionHeader { public: UnwindSectionHeader(A &addressSpace, typename A::pint_t addr) : _addressSpace(addressSpace), _addr(addr) {} uint32_t version() const { return _addressSpace.get32(_addr + offsetof(unwind_info_section_header, version)); } uint32_t commonEncodingsArraySectionOffset() const { return _addressSpace.get32(_addr + offsetof(unwind_info_section_header, commonEncodingsArraySectionOffset)); } uint32_t commonEncodingsArrayCount() const { return _addressSpace.get32(_addr + offsetof(unwind_info_section_header, commonEncodingsArrayCount)); } uint32_t personalityArraySectionOffset() const { return _addressSpace.get32(_addr + offsetof(unwind_info_section_header, personalityArraySectionOffset)); } uint32_t personalityArrayCount() const { return _addressSpace.get32( _addr + offsetof(unwind_info_section_header, personalityArrayCount)); } uint32_t indexSectionOffset() const { return _addressSpace.get32( _addr + offsetof(unwind_info_section_header, indexSectionOffset)); } uint32_t indexCount() const { return _addressSpace.get32( _addr + offsetof(unwind_info_section_header, indexCount)); } private: A &_addressSpace; typename A::pint_t _addr; }; template class UnwindSectionIndexArray { public: UnwindSectionIndexArray(A &addressSpace, typename A::pint_t addr) : _addressSpace(addressSpace), _addr(addr) {} uint32_t functionOffset(uint32_t index) const { return _addressSpace.get32( _addr + arrayoffsetof(unwind_info_section_header_index_entry, index, functionOffset)); } uint32_t secondLevelPagesSectionOffset(uint32_t index) const { return _addressSpace.get32( _addr + arrayoffsetof(unwind_info_section_header_index_entry, index, secondLevelPagesSectionOffset)); } uint32_t lsdaIndexArraySectionOffset(uint32_t index) const { return _addressSpace.get32( _addr + arrayoffsetof(unwind_info_section_header_index_entry, index, lsdaIndexArraySectionOffset)); } private: A &_addressSpace; typename A::pint_t _addr; }; template class 
UnwindSectionRegularPageHeader { public: UnwindSectionRegularPageHeader(A &addressSpace, typename A::pint_t addr) : _addressSpace(addressSpace), _addr(addr) {} uint32_t kind() const { return _addressSpace.get32( _addr + offsetof(unwind_info_regular_second_level_page_header, kind)); } uint16_t entryPageOffset() const { return _addressSpace.get16( _addr + offsetof(unwind_info_regular_second_level_page_header, entryPageOffset)); } uint16_t entryCount() const { return _addressSpace.get16( _addr + offsetof(unwind_info_regular_second_level_page_header, entryCount)); } private: A &_addressSpace; typename A::pint_t _addr; }; template class UnwindSectionRegularArray { public: UnwindSectionRegularArray(A &addressSpace, typename A::pint_t addr) : _addressSpace(addressSpace), _addr(addr) {} uint32_t functionOffset(uint32_t index) const { return _addressSpace.get32( _addr + arrayoffsetof(unwind_info_regular_second_level_entry, index, functionOffset)); } uint32_t encoding(uint32_t index) const { return _addressSpace.get32( _addr + arrayoffsetof(unwind_info_regular_second_level_entry, index, encoding)); } private: A &_addressSpace; typename A::pint_t _addr; }; template class UnwindSectionCompressedPageHeader { public: UnwindSectionCompressedPageHeader(A &addressSpace, typename A::pint_t addr) : _addressSpace(addressSpace), _addr(addr) {} uint32_t kind() const { return _addressSpace.get32( _addr + offsetof(unwind_info_compressed_second_level_page_header, kind)); } uint16_t entryPageOffset() const { return _addressSpace.get16( _addr + offsetof(unwind_info_compressed_second_level_page_header, entryPageOffset)); } uint16_t entryCount() const { return _addressSpace.get16( _addr + offsetof(unwind_info_compressed_second_level_page_header, entryCount)); } uint16_t encodingsPageOffset() const { return _addressSpace.get16( _addr + offsetof(unwind_info_compressed_second_level_page_header, encodingsPageOffset)); } uint16_t encodingsCount() const { return _addressSpace.get16( _addr + offsetof(unwind_info_compressed_second_level_page_header, encodingsCount)); } private: A &_addressSpace; typename A::pint_t _addr; }; template class UnwindSectionCompressedArray { public: UnwindSectionCompressedArray(A &addressSpace, typename A::pint_t addr) : _addressSpace(addressSpace), _addr(addr) {} uint32_t functionOffset(uint32_t index) const { return UNWIND_INFO_COMPRESSED_ENTRY_FUNC_OFFSET( _addressSpace.get32(_addr + index * sizeof(uint32_t))); } uint16_t encodingIndex(uint32_t index) const { return UNWIND_INFO_COMPRESSED_ENTRY_ENCODING_INDEX( _addressSpace.get32(_addr + index * sizeof(uint32_t))); } private: A &_addressSpace; typename A::pint_t _addr; }; template class UnwindSectionLsdaArray { public: UnwindSectionLsdaArray(A &addressSpace, typename A::pint_t addr) : _addressSpace(addressSpace), _addr(addr) {} uint32_t functionOffset(uint32_t index) const { return _addressSpace.get32( _addr + arrayoffsetof(unwind_info_section_header_lsda_index_entry, index, functionOffset)); } uint32_t lsdaOffset(uint32_t index) const { return _addressSpace.get32( _addr + arrayoffsetof(unwind_info_section_header_lsda_index_entry, index, lsdaOffset)); } private: A &_addressSpace; typename A::pint_t _addr; }; #endif // _LIBUNWIND_SUPPORT_COMPACT_UNWIND class _LIBUNWIND_HIDDEN AbstractUnwindCursor { public: // NOTE: provide a class specific placement deallocation function (S5.3.4 p20) // This avoids an unnecessary dependency to libc++abi. 
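The note above is the reason UnwindCursor never touches the heap: unw_init_local() placement-constructs the cursor inside the caller's unw_cursor_t buffer, and the empty class-specific operator delete keeps the virtual destructor from referencing the global operator delete (and thus libc++abi). A self-contained sketch of the pattern, with illustrative names.

#include <cstddef>
#include <new>

struct CursorBase {
  // Class-specific deallocation function: a no-op, because the storage is
  // owned by the caller, never by operator new.
  void operator delete(void *, size_t) {}
  virtual ~CursorBase() {}
  virtual int step() = 0;
};

struct ConcreteCursor : CursorBase {
  int step() override { return 1; }
};

alignas(ConcreteCursor) static unsigned char storage[sizeof(ConcreteCursor)];

int main() {
  CursorBase *c = new (storage) ConcreteCursor(); // like unw_init_local()
  int r = c->step();
  c->~CursorBase(); // destroy in place; no heap allocation or deallocation
  return r;
}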
void operator delete(void *, size_t) {} virtual ~AbstractUnwindCursor() {} virtual bool validReg(int) { _LIBUNWIND_ABORT("validReg not implemented"); } virtual unw_word_t getReg(int) { _LIBUNWIND_ABORT("getReg not implemented"); } virtual void setReg(int, unw_word_t) { _LIBUNWIND_ABORT("setReg not implemented"); } virtual bool validFloatReg(int) { _LIBUNWIND_ABORT("validFloatReg not implemented"); } virtual unw_fpreg_t getFloatReg(int) { _LIBUNWIND_ABORT("getFloatReg not implemented"); } virtual void setFloatReg(int, unw_fpreg_t) { _LIBUNWIND_ABORT("setFloatReg not implemented"); } virtual int step() { _LIBUNWIND_ABORT("step not implemented"); } virtual void getInfo(unw_proc_info_t *) { _LIBUNWIND_ABORT("getInfo not implemented"); } virtual void jumpto() { _LIBUNWIND_ABORT("jumpto not implemented"); } virtual bool isSignalFrame() { _LIBUNWIND_ABORT("isSignalFrame not implemented"); } virtual bool getFunctionName(char *, size_t, unw_word_t *) { _LIBUNWIND_ABORT("getFunctionName not implemented"); } virtual void setInfoBasedOnIPRegister(bool = false) { _LIBUNWIND_ABORT("setInfoBasedOnIPRegister not implemented"); } virtual const char *getRegisterName(int) { _LIBUNWIND_ABORT("getRegisterName not implemented"); } #ifdef __arm__ virtual void saveVFPAsX() { _LIBUNWIND_ABORT("saveVFPAsX not implemented"); } #endif }; /// UnwindCursor contains all state (including all register values) during /// an unwind. This is normally stack allocated inside a unw_cursor_t. template class UnwindCursor : public AbstractUnwindCursor{ typedef typename A::pint_t pint_t; public: UnwindCursor(unw_context_t *context, A &as); UnwindCursor(A &as, void *threadArg); virtual ~UnwindCursor() {} virtual bool validReg(int); virtual unw_word_t getReg(int); virtual void setReg(int, unw_word_t); virtual bool validFloatReg(int); virtual unw_fpreg_t getFloatReg(int); virtual void setFloatReg(int, unw_fpreg_t); virtual int step(); virtual void getInfo(unw_proc_info_t *); virtual void jumpto(); virtual bool isSignalFrame(); virtual bool getFunctionName(char *buf, size_t len, unw_word_t *off); virtual void setInfoBasedOnIPRegister(bool isReturnAddress = false); virtual const char *getRegisterName(int num); #ifdef __arm__ virtual void saveVFPAsX(); #endif private: #if _LIBUNWIND_ARM_EHABI bool getInfoFromEHABISection(pint_t pc, const UnwindInfoSections §s); int stepWithEHABI() { size_t len = 0; size_t off = 0; // FIXME: Calling decode_eht_entry() here is violating the libunwind // abstraction layer. 
const uint32_t *ehtp = decode_eht_entry(reinterpret_cast(_info.unwind_info), &off, &len); if (_Unwind_VRS_Interpret((_Unwind_Context *)this, ehtp, off, len) != _URC_CONTINUE_UNWIND) return UNW_STEP_END; return UNW_STEP_SUCCESS; } #endif #if _LIBUNWIND_SUPPORT_DWARF_UNWIND bool getInfoFromDwarfSection(pint_t pc, const UnwindInfoSections §s, uint32_t fdeSectionOffsetHint=0); int stepWithDwarfFDE() { return DwarfInstructions::stepWithDwarf(_addressSpace, (pint_t)this->getReg(UNW_REG_IP), (pint_t)_info.unwind_info, _registers); } #endif #if _LIBUNWIND_SUPPORT_COMPACT_UNWIND bool getInfoFromCompactEncodingSection(pint_t pc, const UnwindInfoSections §s); int stepWithCompactEncoding() { #if _LIBUNWIND_SUPPORT_DWARF_UNWIND if ( compactSaysUseDwarf() ) return stepWithDwarfFDE(); #endif R dummy; return stepWithCompactEncoding(dummy); } #if defined(_LIBUNWIND_TARGET_X86_64) int stepWithCompactEncoding(Registers_x86_64 &) { return CompactUnwinder_x86_64::stepWithCompactEncoding( _info.format, _info.start_ip, _addressSpace, _registers); } #endif #if defined(_LIBUNWIND_TARGET_I386) int stepWithCompactEncoding(Registers_x86 &) { return CompactUnwinder_x86::stepWithCompactEncoding( _info.format, (uint32_t)_info.start_ip, _addressSpace, _registers); } #endif #if defined(_LIBUNWIND_TARGET_PPC) int stepWithCompactEncoding(Registers_ppc &) { return UNW_EINVAL; } #endif #if defined(_LIBUNWIND_TARGET_AARCH64) int stepWithCompactEncoding(Registers_arm64 &) { return CompactUnwinder_arm64::stepWithCompactEncoding( _info.format, _info.start_ip, _addressSpace, _registers); } #endif bool compactSaysUseDwarf(uint32_t *offset=NULL) const { R dummy; return compactSaysUseDwarf(dummy, offset); } #if defined(_LIBUNWIND_TARGET_X86_64) bool compactSaysUseDwarf(Registers_x86_64 &, uint32_t *offset) const { if ((_info.format & UNWIND_X86_64_MODE_MASK) == UNWIND_X86_64_MODE_DWARF) { if (offset) *offset = (_info.format & UNWIND_X86_64_DWARF_SECTION_OFFSET); return true; } return false; } #endif #if defined(_LIBUNWIND_TARGET_I386) bool compactSaysUseDwarf(Registers_x86 &, uint32_t *offset) const { if ((_info.format & UNWIND_X86_MODE_MASK) == UNWIND_X86_MODE_DWARF) { if (offset) *offset = (_info.format & UNWIND_X86_DWARF_SECTION_OFFSET); return true; } return false; } #endif #if defined(_LIBUNWIND_TARGET_PPC) bool compactSaysUseDwarf(Registers_ppc &, uint32_t *) const { return true; } #endif #if defined(_LIBUNWIND_TARGET_AARCH64) bool compactSaysUseDwarf(Registers_arm64 &, uint32_t *offset) const { if ((_info.format & UNWIND_ARM64_MODE_MASK) == UNWIND_ARM64_MODE_DWARF) { if (offset) *offset = (_info.format & UNWIND_ARM64_DWARF_SECTION_OFFSET); return true; } return false; } #endif #endif // _LIBUNWIND_SUPPORT_COMPACT_UNWIND #if _LIBUNWIND_SUPPORT_DWARF_UNWIND compact_unwind_encoding_t dwarfEncoding() const { R dummy; return dwarfEncoding(dummy); } #if defined(_LIBUNWIND_TARGET_X86_64) compact_unwind_encoding_t dwarfEncoding(Registers_x86_64 &) const { return UNWIND_X86_64_MODE_DWARF; } #endif #if defined(_LIBUNWIND_TARGET_I386) compact_unwind_encoding_t dwarfEncoding(Registers_x86 &) const { return UNWIND_X86_MODE_DWARF; } #endif #if defined(_LIBUNWIND_TARGET_PPC) compact_unwind_encoding_t dwarfEncoding(Registers_ppc &) const { return 0; } #endif #if defined(_LIBUNWIND_TARGET_AARCH64) compact_unwind_encoding_t dwarfEncoding(Registers_arm64 &) const { return UNWIND_ARM64_MODE_DWARF; } #endif #if defined (_LIBUNWIND_TARGET_OR1K) compact_unwind_encoding_t dwarfEncoding(Registers_or1k &) const { return 0; } #endif #if defined 
(_LIBUNWIND_TARGET_RISCV) compact_unwind_encoding_t dwarfEncoding(Registers_riscv &) const { return 0; } #endif #endif // _LIBUNWIND_SUPPORT_DWARF_UNWIND A &_addressSpace; R _registers; unw_proc_info_t _info; bool _unwindInfoMissing; bool _isSignalFrame; }; template UnwindCursor::UnwindCursor(unw_context_t *context, A &as) : _addressSpace(as), _registers(context), _unwindInfoMissing(false), _isSignalFrame(false) { static_assert((check_fit, unw_cursor_t>::does_fit), "UnwindCursor<> does not fit in unw_cursor_t"); memset(&_info, 0, sizeof(_info)); } template UnwindCursor::UnwindCursor(A &as, void *) : _addressSpace(as), _unwindInfoMissing(false), _isSignalFrame(false) { memset(&_info, 0, sizeof(_info)); // FIXME // fill in _registers from thread arg } template bool UnwindCursor::validReg(int regNum) { return _registers.validRegister(regNum); } template unw_word_t UnwindCursor::getReg(int regNum) { return _registers.getRegister(regNum); } template void UnwindCursor::setReg(int regNum, unw_word_t value) { _registers.setRegister(regNum, (typename A::pint_t)value); } template bool UnwindCursor::validFloatReg(int regNum) { return _registers.validFloatRegister(regNum); } template unw_fpreg_t UnwindCursor::getFloatReg(int regNum) { return _registers.getFloatRegister(regNum); } template void UnwindCursor::setFloatReg(int regNum, unw_fpreg_t value) { _registers.setFloatRegister(regNum, value); } template void UnwindCursor::jumpto() { _registers.jumpto(); } #ifdef __arm__ template void UnwindCursor::saveVFPAsX() { _registers.saveVFPAsX(); } #endif template const char *UnwindCursor::getRegisterName(int regNum) { return _registers.getRegisterName(regNum); } template bool UnwindCursor::isSignalFrame() { return _isSignalFrame; } #if _LIBUNWIND_ARM_EHABI struct EHABIIndexEntry { uint32_t functionOffset; uint32_t data; }; template struct EHABISectionIterator { typedef EHABISectionIterator _Self; typedef std::random_access_iterator_tag iterator_category; typedef typename A::pint_t value_type; typedef typename A::pint_t* pointer; typedef typename A::pint_t& reference; typedef size_t size_type; typedef size_t difference_type; static _Self begin(A& addressSpace, const UnwindInfoSections& sects) { return _Self(addressSpace, sects, 0); } static _Self end(A& addressSpace, const UnwindInfoSections& sects) { return _Self(addressSpace, sects, sects.arm_section_length); } EHABISectionIterator(A& addressSpace, const UnwindInfoSections& sects, size_t i) : _i(i), _addressSpace(&addressSpace), _sects(§s) {} _Self& operator++() { ++_i; return *this; } _Self& operator+=(size_t a) { _i += a; return *this; } _Self& operator--() { assert(_i > 0); --_i; return *this; } _Self& operator-=(size_t a) { assert(_i >= a); _i -= a; return *this; } _Self operator+(size_t a) { _Self out = *this; out._i += a; return out; } _Self operator-(size_t a) { assert(_i >= a); _Self out = *this; out._i -= a; return out; } size_t operator-(const _Self& other) { return _i - other._i; } bool operator==(const _Self& other) const { assert(_addressSpace == other._addressSpace); assert(_sects == other._sects); return _i == other._i; } typename A::pint_t operator*() const { return functionAddress(); } typename A::pint_t functionAddress() const { typename A::pint_t indexAddr = _sects->arm_section + arrayoffsetof( EHABIIndexEntry, _i, functionOffset); return indexAddr + signExtendPrel31(_addressSpace->get32(indexAddr)); } typename A::pint_t dataAddress() { typename A::pint_t indexAddr = _sects->arm_section + arrayoffsetof( EHABIIndexEntry, _i, data); return 
indexAddr; } private: size_t _i; A* _addressSpace; const UnwindInfoSections* _sects; }; template bool UnwindCursor::getInfoFromEHABISection( pint_t pc, const UnwindInfoSections §s) { EHABISectionIterator begin = EHABISectionIterator::begin(_addressSpace, sects); EHABISectionIterator end = EHABISectionIterator::end(_addressSpace, sects); EHABISectionIterator itNextPC = std::upper_bound(begin, end, pc); if (itNextPC == begin || itNextPC == end) return false; EHABISectionIterator itThisPC = itNextPC - 1; pint_t thisPC = itThisPC.functionAddress(); pint_t nextPC = itNextPC.functionAddress(); pint_t indexDataAddr = itThisPC.dataAddress(); if (indexDataAddr == 0) return false; uint32_t indexData = _addressSpace.get32(indexDataAddr); if (indexData == UNW_EXIDX_CANTUNWIND) return false; // If the high bit is set, the exception handling table entry is inline inside // the index table entry on the second word (aka |indexDataAddr|). Otherwise, // the table points at an offset in the exception handling table (section 5 EHABI). pint_t exceptionTableAddr; uint32_t exceptionTableData; bool isSingleWordEHT; if (indexData & 0x80000000) { exceptionTableAddr = indexDataAddr; // TODO(ajwong): Should this data be 0? exceptionTableData = indexData; isSingleWordEHT = true; } else { exceptionTableAddr = indexDataAddr + signExtendPrel31(indexData); exceptionTableData = _addressSpace.get32(exceptionTableAddr); isSingleWordEHT = false; } // Now we know the 3 things: // exceptionTableAddr -- exception handler table entry. // exceptionTableData -- the data inside the first word of the eht entry. // isSingleWordEHT -- whether the entry is in the index. unw_word_t personalityRoutine = 0xbadf00d; bool scope32 = false; uintptr_t lsda; // If the high bit in the exception handling table entry is set, the entry is // in compact form (section 6.3 EHABI). if (exceptionTableData & 0x80000000) { // Grab the index of the personality routine from the compact form. uint32_t choice = (exceptionTableData & 0x0f000000) >> 24; uint32_t extraWords = 0; switch (choice) { case 0: personalityRoutine = (unw_word_t) &__aeabi_unwind_cpp_pr0; extraWords = 0; scope32 = false; lsda = isSingleWordEHT ? 0 : (exceptionTableAddr + 4); break; case 1: personalityRoutine = (unw_word_t) &__aeabi_unwind_cpp_pr1; extraWords = (exceptionTableData & 0x00ff0000) >> 16; scope32 = false; lsda = exceptionTableAddr + (extraWords + 1) * 4; break; case 2: personalityRoutine = (unw_word_t) &__aeabi_unwind_cpp_pr2; extraWords = (exceptionTableData & 0x00ff0000) >> 16; scope32 = true; lsda = exceptionTableAddr + (extraWords + 1) * 4; break; default: _LIBUNWIND_ABORT("unknown personality routine"); return false; } if (isSingleWordEHT) { if (extraWords != 0) { _LIBUNWIND_ABORT("index inlined table detected but pr function " "requires extra words"); return false; } } } else { pint_t personalityAddr = exceptionTableAddr + signExtendPrel31(exceptionTableData); personalityRoutine = personalityAddr; // ARM EHABI # 6.2, # 9.2 // // +---- ehtp // v // +--------------------------------------+ // | +--------+--------+--------+-------+ | // | |0| prel31 to personalityRoutine | | // | +--------+--------+--------+-------+ | // | | N | unwind opcodes | | <-- UnwindData // | +--------+--------+--------+-------+ | // | | Word 2 unwind opcodes | | // | +--------+--------+--------+-------+ | // | ... | // | +--------+--------+--------+-------+ | // | | Word N unwind opcodes | | // | +--------+--------+--------+-------+ | // | | LSDA | | <-- lsda // | | ... 
| | // | +--------+--------+--------+-------+ | // +--------------------------------------+ uint32_t *UnwindData = reinterpret_cast(exceptionTableAddr) + 1; uint32_t FirstDataWord = *UnwindData; size_t N = ((FirstDataWord >> 24) & 0xff); size_t NDataWords = N + 1; lsda = reinterpret_cast(UnwindData + NDataWords); } _info.start_ip = thisPC; _info.end_ip = nextPC; _info.handler = personalityRoutine; _info.unwind_info = exceptionTableAddr; _info.lsda = lsda; // flags is pr_cache.additional. See EHABI #7.2 for definition of bit 0. _info.flags = isSingleWordEHT ? 1 : 0 | scope32 ? 0x2 : 0; // Use enum? return true; } #endif #if _LIBUNWIND_SUPPORT_DWARF_UNWIND template bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, const UnwindInfoSections §s, uint32_t fdeSectionOffsetHint) { typename CFI_Parser::FDE_Info fdeInfo; typename CFI_Parser::CIE_Info cieInfo; bool foundFDE = false; bool foundInCache = false; // If compact encoding table gave offset into dwarf section, go directly there if (fdeSectionOffsetHint != 0) { foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, (uint32_t)sects.dwarf_section_length, sects.dwarf_section + fdeSectionOffsetHint, &fdeInfo, &cieInfo); } #if _LIBUNWIND_SUPPORT_DWARF_INDEX if (!foundFDE && (sects.dwarf_index_section != 0)) { foundFDE = EHHeaderParser::findFDE( _addressSpace, pc, sects.dwarf_index_section, (uint32_t)sects.dwarf_index_section_length, &fdeInfo, &cieInfo); } #endif if (!foundFDE) { // otherwise, search cache of previously found FDEs. pint_t cachedFDE = DwarfFDECache::findFDE(sects.dso_base, pc); if (cachedFDE != 0) { foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, (uint32_t)sects.dwarf_section_length, cachedFDE, &fdeInfo, &cieInfo); foundInCache = foundFDE; } } if (!foundFDE) { // Still not found, do full scan of __eh_frame section. foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, (uint32_t)sects.dwarf_section_length, 0, &fdeInfo, &cieInfo); } if (foundFDE) { typename CFI_Parser::PrologInfo prolog; if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, pc, &prolog)) { // Save off parsed FDE info _info.start_ip = fdeInfo.pcStart; _info.end_ip = fdeInfo.pcEnd; _info.lsda = fdeInfo.lsda; _info.handler = cieInfo.personality; _info.gp = prolog.spExtraArgSize; _info.flags = 0; _info.format = dwarfEncoding(); _info.unwind_info = fdeInfo.fdeStart; _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; _info.extra = (unw_word_t) sects.dso_base; // Add to cache (to make next lookup faster) if we had no hint // and there was no index. 
if (!foundInCache && (fdeSectionOffsetHint == 0)) { #if _LIBUNWIND_SUPPORT_DWARF_INDEX if (sects.dwarf_index_section == 0) #endif DwarfFDECache::add(sects.dso_base, fdeInfo.pcStart, fdeInfo.pcEnd, fdeInfo.fdeStart); } return true; } } - //_LIBUNWIND_DEBUG_LOG("can't find/use FDE for pc=0x%llX\n", (uint64_t)pc); + //_LIBUNWIND_DEBUG_LOG("can't find/use FDE for pc=0x%llX", (uint64_t)pc); return false; } #endif // _LIBUNWIND_SUPPORT_DWARF_UNWIND #if _LIBUNWIND_SUPPORT_COMPACT_UNWIND template bool UnwindCursor::getInfoFromCompactEncodingSection(pint_t pc, const UnwindInfoSections §s) { const bool log = false; if (log) fprintf(stderr, "getInfoFromCompactEncodingSection(pc=0x%llX, mh=0x%llX)\n", (uint64_t)pc, (uint64_t)sects.dso_base); const UnwindSectionHeader sectionHeader(_addressSpace, sects.compact_unwind_section); if (sectionHeader.version() != UNWIND_SECTION_VERSION) return false; // do a binary search of top level index to find page with unwind info pint_t targetFunctionOffset = pc - sects.dso_base; const UnwindSectionIndexArray topIndex(_addressSpace, sects.compact_unwind_section + sectionHeader.indexSectionOffset()); uint32_t low = 0; uint32_t high = sectionHeader.indexCount(); uint32_t last = high - 1; while (low < high) { uint32_t mid = (low + high) / 2; //if ( log ) fprintf(stderr, "\tmid=%d, low=%d, high=%d, *mid=0x%08X\n", //mid, low, high, topIndex.functionOffset(mid)); if (topIndex.functionOffset(mid) <= targetFunctionOffset) { if ((mid == last) || (topIndex.functionOffset(mid + 1) > targetFunctionOffset)) { low = mid; break; } else { low = mid + 1; } } else { high = mid; } } const uint32_t firstLevelFunctionOffset = topIndex.functionOffset(low); const uint32_t firstLevelNextPageFunctionOffset = topIndex.functionOffset(low + 1); const pint_t secondLevelAddr = sects.compact_unwind_section + topIndex.secondLevelPagesSectionOffset(low); const pint_t lsdaArrayStartAddr = sects.compact_unwind_section + topIndex.lsdaIndexArraySectionOffset(low); const pint_t lsdaArrayEndAddr = sects.compact_unwind_section + topIndex.lsdaIndexArraySectionOffset(low+1); if (log) fprintf(stderr, "\tfirst level search for result index=%d " "to secondLevelAddr=0x%llX\n", low, (uint64_t) secondLevelAddr); // do a binary search of second level page index uint32_t encoding = 0; pint_t funcStart = 0; pint_t funcEnd = 0; pint_t lsda = 0; pint_t personality = 0; uint32_t pageKind = _addressSpace.get32(secondLevelAddr); if (pageKind == UNWIND_SECOND_LEVEL_REGULAR) { // regular page UnwindSectionRegularPageHeader pageHeader(_addressSpace, secondLevelAddr); UnwindSectionRegularArray pageIndex( _addressSpace, secondLevelAddr + pageHeader.entryPageOffset()); // binary search looks for entry with e where index[e].offset <= pc < // index[e+1].offset if (log) fprintf(stderr, "\tbinary search for targetFunctionOffset=0x%08llX in " "regular page starting at secondLevelAddr=0x%llX\n", (uint64_t) targetFunctionOffset, (uint64_t) secondLevelAddr); low = 0; high = pageHeader.entryCount(); while (low < high) { uint32_t mid = (low + high) / 2; if (pageIndex.functionOffset(mid) <= targetFunctionOffset) { if (mid == (uint32_t)(pageHeader.entryCount() - 1)) { // at end of table low = mid; funcEnd = firstLevelNextPageFunctionOffset + sects.dso_base; break; } else if (pageIndex.functionOffset(mid + 1) > targetFunctionOffset) { // next is too big, so we found it low = mid; funcEnd = pageIndex.functionOffset(low + 1) + sects.dso_base; break; } else { low = mid + 1; } } else { high = mid; } } encoding = pageIndex.encoding(low); 
funcStart = pageIndex.functionOffset(low) + sects.dso_base; if (pc < funcStart) { if (log) fprintf( stderr, "\tpc not in table, pc=0x%llX, funcStart=0x%llX, funcEnd=0x%llX\n", (uint64_t) pc, (uint64_t) funcStart, (uint64_t) funcEnd); return false; } if (pc > funcEnd) { if (log) fprintf( stderr, "\tpc not in table, pc=0x%llX, funcStart=0x%llX, funcEnd=0x%llX\n", (uint64_t) pc, (uint64_t) funcStart, (uint64_t) funcEnd); return false; } } else if (pageKind == UNWIND_SECOND_LEVEL_COMPRESSED) { // compressed page UnwindSectionCompressedPageHeader pageHeader(_addressSpace, secondLevelAddr); UnwindSectionCompressedArray pageIndex( _addressSpace, secondLevelAddr + pageHeader.entryPageOffset()); const uint32_t targetFunctionPageOffset = (uint32_t)(targetFunctionOffset - firstLevelFunctionOffset); // binary search looks for entry with e where index[e].offset <= pc < // index[e+1].offset if (log) fprintf(stderr, "\tbinary search of compressed page starting at " "secondLevelAddr=0x%llX\n", (uint64_t) secondLevelAddr); low = 0; last = pageHeader.entryCount() - 1; high = pageHeader.entryCount(); while (low < high) { uint32_t mid = (low + high) / 2; if (pageIndex.functionOffset(mid) <= targetFunctionPageOffset) { if ((mid == last) || (pageIndex.functionOffset(mid + 1) > targetFunctionPageOffset)) { low = mid; break; } else { low = mid + 1; } } else { high = mid; } } funcStart = pageIndex.functionOffset(low) + firstLevelFunctionOffset + sects.dso_base; if (low < last) funcEnd = pageIndex.functionOffset(low + 1) + firstLevelFunctionOffset + sects.dso_base; else funcEnd = firstLevelNextPageFunctionOffset + sects.dso_base; if (pc < funcStart) { _LIBUNWIND_DEBUG_LOG("malformed __unwind_info, pc=0x%llX not in second " - "level compressed unwind table. funcStart=0x%llX\n", + "level compressed unwind table. funcStart=0x%llX", (uint64_t) pc, (uint64_t) funcStart); return false; } if (pc > funcEnd) { _LIBUNWIND_DEBUG_LOG("malformed __unwind_info, pc=0x%llX not in second " - "level compressed unwind table. funcEnd=0x%llX\n", + "level compressed unwind table. 
funcEnd=0x%llX", (uint64_t) pc, (uint64_t) funcEnd); return false; } uint16_t encodingIndex = pageIndex.encodingIndex(low); if (encodingIndex < sectionHeader.commonEncodingsArrayCount()) { // encoding is in common table in section header encoding = _addressSpace.get32( sects.compact_unwind_section + sectionHeader.commonEncodingsArraySectionOffset() + encodingIndex * sizeof(uint32_t)); } else { // encoding is in page specific table uint16_t pageEncodingIndex = encodingIndex - (uint16_t)sectionHeader.commonEncodingsArrayCount(); encoding = _addressSpace.get32(secondLevelAddr + pageHeader.encodingsPageOffset() + pageEncodingIndex * sizeof(uint32_t)); } } else { _LIBUNWIND_DEBUG_LOG("malformed __unwind_info at 0x%0llX bad second " - "level page\n", + "level page", (uint64_t) sects.compact_unwind_section); return false; } // look up LSDA, if encoding says function has one if (encoding & UNWIND_HAS_LSDA) { UnwindSectionLsdaArray lsdaIndex(_addressSpace, lsdaArrayStartAddr); uint32_t funcStartOffset = (uint32_t)(funcStart - sects.dso_base); low = 0; high = (uint32_t)(lsdaArrayEndAddr - lsdaArrayStartAddr) / sizeof(unwind_info_section_header_lsda_index_entry); // binary search looks for entry with exact match for functionOffset if (log) fprintf(stderr, "\tbinary search of lsda table for targetFunctionOffset=0x%08X\n", funcStartOffset); while (low < high) { uint32_t mid = (low + high) / 2; if (lsdaIndex.functionOffset(mid) == funcStartOffset) { lsda = lsdaIndex.lsdaOffset(mid) + sects.dso_base; break; } else if (lsdaIndex.functionOffset(mid) < funcStartOffset) { low = mid + 1; } else { high = mid; } } if (lsda == 0) { _LIBUNWIND_DEBUG_LOG("found encoding 0x%08X with HAS_LSDA bit set for " - "pc=0x%0llX, but lsda table has no entry\n", + "pc=0x%0llX, but lsda table has no entry", encoding, (uint64_t) pc); return false; } } // extact personality routine, if encoding says function has one uint32_t personalityIndex = (encoding & UNWIND_PERSONALITY_MASK) >> (__builtin_ctz(UNWIND_PERSONALITY_MASK)); if (personalityIndex != 0) { --personalityIndex; // change 1-based to zero-based index if (personalityIndex > sectionHeader.personalityArrayCount()) { _LIBUNWIND_DEBUG_LOG("found encoding 0x%08X with personality index %d, " - "but personality table has only %d entires\n", + "but personality table has only %d entires", encoding, personalityIndex, sectionHeader.personalityArrayCount()); return false; } int32_t personalityDelta = (int32_t)_addressSpace.get32( sects.compact_unwind_section + sectionHeader.personalityArraySectionOffset() + personalityIndex * sizeof(uint32_t)); pint_t personalityPointer = sects.dso_base + (pint_t)personalityDelta; personality = _addressSpace.getP(personalityPointer); if (log) fprintf(stderr, "getInfoFromCompactEncodingSection(pc=0x%llX), " "personalityDelta=0x%08X, personality=0x%08llX\n", (uint64_t) pc, personalityDelta, (uint64_t) personality); } if (log) fprintf(stderr, "getInfoFromCompactEncodingSection(pc=0x%llX), " "encoding=0x%08X, lsda=0x%08llX for funcStart=0x%llX\n", (uint64_t) pc, encoding, (uint64_t) lsda, (uint64_t) funcStart); _info.start_ip = funcStart; _info.end_ip = funcEnd; _info.lsda = lsda; _info.handler = personality; _info.gp = 0; _info.flags = 0; _info.format = encoding; _info.unwind_info = 0; _info.unwind_info_size = 0; _info.extra = sects.dso_base; return true; } #endif // _LIBUNWIND_SUPPORT_COMPACT_UNWIND template void UnwindCursor::setInfoBasedOnIPRegister(bool isReturnAddress) { pint_t pc = (pint_t)this->getReg(UNW_REG_IP); #if _LIBUNWIND_ARM_EHABI // 
Remove the thumb bit so the IP represents the actual instruction address. // This matches the behaviour of _Unwind_GetIP on arm. pc &= (pint_t)~0x1; #endif // If the last line of a function is a "throw" the compiler sometimes // emits no instructions after the call to __cxa_throw. This means // the return address is actually the start of the next function. // To disambiguate this, back up the pc when we know it is a return // address. if (isReturnAddress) --pc; // Ask address space object to find unwind sections for this pc. UnwindInfoSections sects; if (_addressSpace.findUnwindSections(pc, sects)) { #if _LIBUNWIND_SUPPORT_COMPACT_UNWIND // If there is a compact unwind encoding table, look there first. if (sects.compact_unwind_section != 0) { if (this->getInfoFromCompactEncodingSection(pc, sects)) { #if _LIBUNWIND_SUPPORT_DWARF_UNWIND // Found info in table, done unless encoding says to use dwarf. uint32_t dwarfOffset; if ((sects.dwarf_section != 0) && compactSaysUseDwarf(&dwarfOffset)) { if (this->getInfoFromDwarfSection(pc, sects, dwarfOffset)) { // found info in dwarf, done return; } } #endif // If unwind table has entry, but entry says there is no unwind info, // record that we have no unwind info. if (_info.format == 0) _unwindInfoMissing = true; return; } } #endif // _LIBUNWIND_SUPPORT_COMPACT_UNWIND #if _LIBUNWIND_SUPPORT_DWARF_UNWIND // If there is dwarf unwind info, look there next. if (sects.dwarf_section != 0) { if (this->getInfoFromDwarfSection(pc, sects)) { // found info in dwarf, done return; } } #endif #if _LIBUNWIND_ARM_EHABI // If there is ARM EHABI unwind info, look there next. if (sects.arm_section != 0 && this->getInfoFromEHABISection(pc, sects)) return; #endif } #if _LIBUNWIND_SUPPORT_DWARF_UNWIND // There is no static unwind info for this pc. Look to see if an FDE was // dynamically registered for it. pint_t cachedFDE = DwarfFDECache::findFDE(0, pc); if (cachedFDE != 0) { CFI_Parser::FDE_Info fdeInfo; CFI_Parser::CIE_Info cieInfo; const char *msg = CFI_Parser::decodeFDE(_addressSpace, cachedFDE, &fdeInfo, &cieInfo); if (msg == NULL) { typename CFI_Parser::PrologInfo prolog; if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, pc, &prolog)) { // save off parsed FDE info _info.start_ip = fdeInfo.pcStart; _info.end_ip = fdeInfo.pcEnd; _info.lsda = fdeInfo.lsda; _info.handler = cieInfo.personality; _info.gp = prolog.spExtraArgSize; // Some frameless functions need SP // altered when resuming in function. _info.flags = 0; _info.format = dwarfEncoding(); _info.unwind_info = fdeInfo.fdeStart; _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; _info.extra = 0; return; } } } // Lastly, ask AddressSpace object about platform specific ways to locate // other FDEs. pint_t fde; if (_addressSpace.findOtherFDE(pc, fde)) { CFI_Parser::FDE_Info fdeInfo; CFI_Parser::CIE_Info cieInfo; if (!CFI_Parser::decodeFDE(_addressSpace, fde, &fdeInfo, &cieInfo)) { // Double check this FDE is for a function that includes the pc. 
if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) { typename CFI_Parser::PrologInfo prolog; if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, pc, &prolog)) { // save off parsed FDE info _info.start_ip = fdeInfo.pcStart; _info.end_ip = fdeInfo.pcEnd; _info.lsda = fdeInfo.lsda; _info.handler = cieInfo.personality; _info.gp = prolog.spExtraArgSize; _info.flags = 0; _info.format = dwarfEncoding(); _info.unwind_info = fdeInfo.fdeStart; _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; _info.extra = 0; return; } } } } #endif // #if _LIBUNWIND_SUPPORT_DWARF_UNWIND // no unwind info, flag that we can't reliably unwind _unwindInfoMissing = true; } template int UnwindCursor::step() { // Bottom of stack is defined is when unwind info cannot be found. if (_unwindInfoMissing) return UNW_STEP_END; // Use unwinding info to modify register set as if function returned. int result; #if _LIBUNWIND_SUPPORT_COMPACT_UNWIND result = this->stepWithCompactEncoding(); #elif _LIBUNWIND_SUPPORT_DWARF_UNWIND result = this->stepWithDwarfFDE(); #elif _LIBUNWIND_ARM_EHABI result = this->stepWithEHABI(); #else #error Need _LIBUNWIND_SUPPORT_COMPACT_UNWIND or \ _LIBUNWIND_SUPPORT_DWARF_UNWIND or \ _LIBUNWIND_ARM_EHABI #endif // update info based on new PC if (result == UNW_STEP_SUCCESS) { this->setInfoBasedOnIPRegister(true); if (_unwindInfoMissing) return UNW_STEP_END; if (_info.gp) setReg(UNW_REG_SP, getReg(UNW_REG_SP) + _info.gp); } return result; } template void UnwindCursor::getInfo(unw_proc_info_t *info) { *info = _info; } template bool UnwindCursor::getFunctionName(char *buf, size_t bufLen, unw_word_t *offset) { return _addressSpace.findFunctionName((pint_t)this->getReg(UNW_REG_IP), buf, bufLen, offset); } } // namespace libunwind #endif // __UNWINDCURSOR_HPP__ Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindLevel1-gcc-ext.c =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindLevel1-gcc-ext.c (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindLevel1-gcc-ext.c (revision 308054) @@ -1,316 +1,316 @@ //===--------------------- UnwindLevel1-gcc-ext.c -------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Implements gcc extensions to the C++ ABI Exception Handling Level 1. // //===----------------------------------------------------------------------===// #include #include #include #include #include #include #include "config.h" #include "libunwind_ext.h" #include "libunwind.h" #include "Unwind-EHABI.h" #include "unwind.h" #if _LIBUNWIND_BUILD_ZERO_COST_APIS /// Called by __cxa_rethrow(). 
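/// A minimal sketch (assumed and simplified; the real libc++abi __cxa_rethrow
/// differs) of how this entry point is reached, where exception_header is a
/// hypothetical name for the runtime's exception record:
///
///     // inside __cxa_rethrow():
///     _Unwind_Resume_or_Rethrow(&exception_header->unwindHeader);
///     // only returns when no handler is found; the runtime then
///     // calls std::terminate()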
_LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_Resume_or_Rethrow(_Unwind_Exception *exception_object) { #if _LIBUNWIND_ARM_EHABI - _LIBUNWIND_TRACE_API("_Unwind_Resume_or_Rethrow(ex_obj=%p), private_1=%ld\n", + _LIBUNWIND_TRACE_API("_Unwind_Resume_or_Rethrow(ex_obj=%p), private_1=%ld", (void *)exception_object, (long)exception_object->unwinder_cache.reserved1); #else - _LIBUNWIND_TRACE_API("_Unwind_Resume_or_Rethrow(ex_obj=%p), private_1=%ld\n", + _LIBUNWIND_TRACE_API("_Unwind_Resume_or_Rethrow(ex_obj=%p), private_1=%ld", (void *)exception_object, (long)exception_object->private_1); #endif #if _LIBUNWIND_ARM_EHABI // _Unwind_RaiseException on EHABI will always set the reserved1 field to 0, // which is in the same position as private_1 below. return _Unwind_RaiseException(exception_object); #else // If this is non-forced and a stopping place was found, then this is a // re-throw. // Call _Unwind_RaiseException() as if this was a new exception if (exception_object->private_1 == 0) { return _Unwind_RaiseException(exception_object); // Will return if there is no catch clause, so that __cxa_rethrow can call // std::terminate(). } // Call through to _Unwind_Resume() which distiguishes between forced and // regular exceptions. _Unwind_Resume(exception_object); _LIBUNWIND_ABORT("_Unwind_Resume_or_Rethrow() called _Unwind_RaiseException()" " which unexpectedly returned"); #endif } /// Called by personality handler during phase 2 to get base address for data /// relative encodings. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetDataRelBase(struct _Unwind_Context *context) { (void)context; - _LIBUNWIND_TRACE_API("_Unwind_GetDataRelBase(context=%p)\n", (void *)context); + _LIBUNWIND_TRACE_API("_Unwind_GetDataRelBase(context=%p)", (void *)context); _LIBUNWIND_ABORT("_Unwind_GetDataRelBase() not implemented"); } /// Called by personality handler during phase 2 to get base address for text /// relative encodings. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetTextRelBase(struct _Unwind_Context *context) { (void)context; - _LIBUNWIND_TRACE_API("_Unwind_GetTextRelBase(context=%p)\n", (void *)context); + _LIBUNWIND_TRACE_API("_Unwind_GetTextRelBase(context=%p)", (void *)context); _LIBUNWIND_ABORT("_Unwind_GetTextRelBase() not implemented"); } /// Scans unwind information to find the function that contains the /// specified code address "pc". _LIBUNWIND_EXPORT void *_Unwind_FindEnclosingFunction(void *pc) { - _LIBUNWIND_TRACE_API("_Unwind_FindEnclosingFunction(pc=%p)\n", pc); + _LIBUNWIND_TRACE_API("_Unwind_FindEnclosingFunction(pc=%p)", pc); // This is slow, but works. // We create an unwind cursor then alter the IP to be pc unw_cursor_t cursor; unw_context_t uc; unw_proc_info_t info; unw_getcontext(&uc); unw_init_local(&cursor, &uc); unw_set_reg(&cursor, UNW_REG_IP, (unw_word_t)(long) pc); if (unw_get_proc_info(&cursor, &info) == UNW_ESUCCESS) return (void *)(long) info.start_ip; else return NULL; } /// Walk every frame and call trace function at each one. If trace function /// returns anything other than _URC_NO_REASON, then walk is terminated. _LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn callback, void *ref) { unw_cursor_t cursor; unw_context_t uc; unw_getcontext(&uc); unw_init_local(&cursor, &uc); - _LIBUNWIND_TRACE_API("_Unwind_Backtrace(callback=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_Backtrace(callback=%p)", (void *)(uintptr_t)callback); #if _LIBUNWIND_ARM_EHABI // Create a mock exception object for force unwinding. 
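  // The 64-bit constant below is the ASCII bytes "CLNGUNW\0"
  // (0x43 'C', 0x4C 'L', 0x4E 'N', 0x47 'G', 0x55 'U', 0x4E 'N', 0x57 'W', 0x00),
  // i.e. a vendor/language tag in the usual _Unwind_Exception exception_class
  // convention, presumably so personality routines can recognize this
  // synthetic force-unwind object as libunwind's own rather than as a real
  // language exception.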
_Unwind_Exception ex; memset(&ex, '\0', sizeof(ex)); ex.exception_class = 0x434C4E47554E5700; // CLNGUNW\0 #endif // walk each frame while (true) { _Unwind_Reason_Code result; #if !_LIBUNWIND_ARM_EHABI - // ask libuwind to get next frame (skip over first frame which is + // ask libunwind to get next frame (skip over first frame which is // _Unwind_Backtrace()) if (unw_step(&cursor) <= 0) { _LIBUNWIND_TRACE_UNWINDING(" _backtrace: ended because cursor reached " - "bottom of stack, returning %d\n", + "bottom of stack, returning %d", _URC_END_OF_STACK); return _URC_END_OF_STACK; } #else // Get the information for this frame. unw_proc_info_t frameInfo; if (unw_get_proc_info(&cursor, &frameInfo) != UNW_ESUCCESS) { return _URC_END_OF_STACK; } // Update the pr_cache in the mock exception object. const uint32_t* unwindInfo = (uint32_t *) frameInfo.unwind_info; ex.pr_cache.fnstart = frameInfo.start_ip; ex.pr_cache.ehtp = (_Unwind_EHT_Header *) unwindInfo; ex.pr_cache.additional= frameInfo.flags; struct _Unwind_Context *context = (struct _Unwind_Context *)&cursor; // Get and call the personality function to unwind the frame. __personality_routine handler = (__personality_routine) frameInfo.handler; if (handler == NULL) { return _URC_END_OF_STACK; } if (handler(_US_VIRTUAL_UNWIND_FRAME | _US_FORCE_UNWIND, &ex, context) != _URC_CONTINUE_UNWIND) { return _URC_END_OF_STACK; } #endif // _LIBUNWIND_ARM_EHABI // debugging if (_LIBUNWIND_TRACING_UNWINDING) { char functionName[512]; unw_proc_info_t frame; unw_word_t offset; unw_get_proc_name(&cursor, functionName, 512, &offset); unw_get_proc_info(&cursor, &frame); _LIBUNWIND_TRACE_UNWINDING( - " _backtrace: start_ip=0x%llX, func=%s, lsda=0x%llX, context=%p\n", + " _backtrace: start_ip=0x%llX, func=%s, lsda=0x%llX, context=%p", (long long)frame.start_ip, functionName, (long long)frame.lsda, (void *)&cursor); } // call trace function with this frame result = (*callback)((struct _Unwind_Context *)(&cursor), ref); if (result != _URC_NO_REASON) { _LIBUNWIND_TRACE_UNWINDING( - " _backtrace: ended because callback returned %d\n", result); + " _backtrace: ended because callback returned %d", result); return result; } } } /// Find dwarf unwind info for an address 'pc' in some function. _LIBUNWIND_EXPORT const void *_Unwind_Find_FDE(const void *pc, struct dwarf_eh_bases *bases) { // This is slow, but works. // We create an unwind cursor then alter the IP to be pc unw_cursor_t cursor; unw_context_t uc; unw_proc_info_t info; unw_getcontext(&uc); unw_init_local(&cursor, &uc); unw_set_reg(&cursor, UNW_REG_IP, (unw_word_t)(long) pc); unw_get_proc_info(&cursor, &info); bases->tbase = (uintptr_t)info.extra; bases->dbase = 0; // dbase not used on Mac OS X bases->func = (uintptr_t)info.start_ip; - _LIBUNWIND_TRACE_API("_Unwind_Find_FDE(pc=%p) => %p\n", pc, + _LIBUNWIND_TRACE_API("_Unwind_Find_FDE(pc=%p) => %p", pc, (void *)(long) info.unwind_info); return (void *)(long) info.unwind_info; } /// Returns the CFA (call frame area, or stack pointer at start of function) /// for the current context. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetCFA(struct _Unwind_Context *context) { unw_cursor_t *cursor = (unw_cursor_t *)context; unw_word_t result; unw_get_reg(cursor, UNW_REG_SP, &result); - _LIBUNWIND_TRACE_API("_Unwind_GetCFA(context=%p) => 0x%" PRIx64 "\n", + _LIBUNWIND_TRACE_API("_Unwind_GetCFA(context=%p) => 0x%" PRIx64, (void *)context, (uint64_t)result); return (uintptr_t)result; } /// Called by personality handler during phase 2 to get instruction pointer. 
/// ipBefore is a boolean that says if IP is already adjusted to be the call /// site address. Normally IP is the return address. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetIPInfo(struct _Unwind_Context *context, int *ipBefore) { - _LIBUNWIND_TRACE_API("_Unwind_GetIPInfo(context=%p)\n", (void *)context); + _LIBUNWIND_TRACE_API("_Unwind_GetIPInfo(context=%p)", (void *)context); *ipBefore = 0; return _Unwind_GetIP(context); } #if _LIBUNWIND_SUPPORT_DWARF_UNWIND /// Called by programs with dynamic code generators that want /// to register a dynamically generated FDE. /// This function has existed on Mac OS X since 10.4, but /// was broken until 10.6. _LIBUNWIND_EXPORT void __register_frame(const void *fde) { - _LIBUNWIND_TRACE_API("__register_frame(%p)\n", fde); + _LIBUNWIND_TRACE_API("__register_frame(%p)", fde); _unw_add_dynamic_fde((unw_word_t)(uintptr_t) fde); } /// Called by programs with dynamic code generators that want /// to unregister a dynamically generated FDE. /// This function has existed on Mac OS X since 10.4, but /// was broken until 10.6. _LIBUNWIND_EXPORT void __deregister_frame(const void *fde) { - _LIBUNWIND_TRACE_API("__deregister_frame(%p)\n", fde); + _LIBUNWIND_TRACE_API("__deregister_frame(%p)", fde); _unw_remove_dynamic_fde((unw_word_t)(uintptr_t) fde); } // The following register/deregister functions are gcc extensions. // They have existed on Mac OS X, but have never worked because Mac OS X // before 10.6 used keymgr to track known FDEs, but these functions // never got updated to use keymgr. // For now, we implement these as do-nothing functions to keep any existing // applications working. We also add the not in 10.6 symbol so that nwe // application won't be able to use them. #if _LIBUNWIND_SUPPORT_FRAME_APIS _LIBUNWIND_EXPORT void __register_frame_info_bases(const void *fde, void *ob, void *tb, void *db) { (void)fde; (void)ob; (void)tb; (void)db; - _LIBUNWIND_TRACE_API("__register_frame_info_bases(%p,%p, %p, %p)\n", + _LIBUNWIND_TRACE_API("__register_frame_info_bases(%p,%p, %p, %p)", fde, ob, tb, db); // do nothing, this function never worked in Mac OS X } _LIBUNWIND_EXPORT void __register_frame_info(const void *fde, void *ob) { (void)fde; (void)ob; - _LIBUNWIND_TRACE_API("__register_frame_info(%p, %p)\n", fde, ob); + _LIBUNWIND_TRACE_API("__register_frame_info(%p, %p)", fde, ob); // do nothing, this function never worked in Mac OS X } _LIBUNWIND_EXPORT void __register_frame_info_table_bases(const void *fde, void *ob, void *tb, void *db) { (void)fde; (void)ob; (void)tb; (void)db; _LIBUNWIND_TRACE_API("__register_frame_info_table_bases" - "(%p,%p, %p, %p)\n", fde, ob, tb, db); + "(%p,%p, %p, %p)", fde, ob, tb, db); // do nothing, this function never worked in Mac OS X } _LIBUNWIND_EXPORT void __register_frame_info_table(const void *fde, void *ob) { (void)fde; (void)ob; - _LIBUNWIND_TRACE_API("__register_frame_info_table(%p, %p)\n", fde, ob); + _LIBUNWIND_TRACE_API("__register_frame_info_table(%p, %p)", fde, ob); // do nothing, this function never worked in Mac OS X } _LIBUNWIND_EXPORT void __register_frame_table(const void *fde) { (void)fde; - _LIBUNWIND_TRACE_API("__register_frame_table(%p)\n", fde); + _LIBUNWIND_TRACE_API("__register_frame_table(%p)", fde); // do nothing, this function never worked in Mac OS X } _LIBUNWIND_EXPORT void *__deregister_frame_info(const void *fde) { (void)fde; - _LIBUNWIND_TRACE_API("__deregister_frame_info(%p)\n", fde); + _LIBUNWIND_TRACE_API("__deregister_frame_info(%p)", fde); // do nothing, this function never worked in Mac OS X 
return NULL; } _LIBUNWIND_EXPORT void *__deregister_frame_info_bases(const void *fde) { (void)fde; - _LIBUNWIND_TRACE_API("__deregister_frame_info_bases(%p)\n", fde); + _LIBUNWIND_TRACE_API("__deregister_frame_info_bases(%p)", fde); // do nothing, this function never worked in Mac OS X return NULL; } #endif // _LIBUNWIND_SUPPORT_FRAME_APIS #endif // _LIBUNWIND_SUPPORT_DWARF_UNWIND #endif // _LIBUNWIND_BUILD_ZERO_COST_APIS Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindLevel1.c =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindLevel1.c (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/UnwindLevel1.c (revision 308054) @@ -1,506 +1,506 @@ //===------------------------- UnwindLevel1.c -----------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Implements C++ ABI Exception Handling Level 1 as documented at: // http://mentorembedded.github.io/cxx-abi/abi-eh.html // using libunwind // //===----------------------------------------------------------------------===// // ARM EHABI does not specify _Unwind_{Get,Set}{GR,IP}(). Thus, we are // defining inline functions to delegate the function calls to // _Unwind_VRS_{Get,Set}(). However, some applications might declare the // function protetype directly (instead of including ), thus we need // to export these functions from libunwind.so as well. #define _LIBUNWIND_UNWIND_LEVEL1_EXTERNAL_LINKAGE 1 #include #include #include #include #include #include #include "libunwind.h" #include "unwind.h" #include "config.h" #if !_LIBUNWIND_ARM_EHABI static _Unwind_Reason_Code unwind_phase1(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) { unw_init_local(cursor, uc); // Walk each frame looking for a place to stop. bool handlerNotFound = true; while (handlerNotFound) { - // Ask libuwind to get next frame (skip over first which is + // Ask libunwind to get next frame (skip over first which is // _Unwind_RaiseException). int stepResult = unw_step(cursor); if (stepResult == 0) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): unw_step() reached " - "bottom => _URC_END_OF_STACK\n", + "bottom => _URC_END_OF_STACK", (void *)exception_object); return _URC_END_OF_STACK; } else if (stepResult < 0) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): unw_step failed => " - "_URC_FATAL_PHASE1_ERROR\n", + "_URC_FATAL_PHASE1_ERROR", (void *)exception_object); return _URC_FATAL_PHASE1_ERROR; } // See if frame has code to run (has personality routine). unw_proc_info_t frameInfo; unw_word_t sp; if (unw_get_proc_info(cursor, &frameInfo) != UNW_ESUCCESS) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase1(ex_ojb=%p): unw_get_proc_info " - "failed => _URC_FATAL_PHASE1_ERROR\n", + "failed => _URC_FATAL_PHASE1_ERROR", (void *)exception_object); return _URC_FATAL_PHASE1_ERROR; } // When tracing, print state information. 
if (_LIBUNWIND_TRACING_UNWINDING) { char functionBuf[512]; const char *functionName = functionBuf; unw_word_t offset; if ((unw_get_proc_name(cursor, functionBuf, sizeof(functionBuf), &offset) != UNW_ESUCCESS) || (frameInfo.start_ip + offset > frameInfo.end_ip)) functionName = ".anonymous."; unw_word_t pc; unw_get_reg(cursor, UNW_REG_IP, &pc); _LIBUNWIND_TRACE_UNWINDING( "unwind_phase1(ex_ojb=%p): pc=0x%" PRIx64 ", start_ip=0x%" PRIx64 - ", func=%s, lsda=0x%" PRIx64 ", personality=0x%" PRIx64 "\n", + ", func=%s, lsda=0x%" PRIx64 ", personality=0x%" PRIx64 "", (void *)exception_object, pc, frameInfo.start_ip, functionName, frameInfo.lsda, frameInfo.handler); } // If there is a personality routine, ask it if it will want to stop at // this frame. if (frameInfo.handler != 0) { __personality_routine p = (__personality_routine)(long)(frameInfo.handler); _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): calling personality function %p\n", + "unwind_phase1(ex_ojb=%p): calling personality function %p", (void *)exception_object, (void *)(uintptr_t)p); _Unwind_Reason_Code personalityResult = (*p)(1, _UA_SEARCH_PHASE, exception_object->exception_class, exception_object, (struct _Unwind_Context *)(cursor)); switch (personalityResult) { case _URC_HANDLER_FOUND: // found a catch clause or locals that need destructing in this frame // stop search and remember stack pointer at the frame handlerNotFound = false; unw_get_reg(cursor, UNW_REG_SP, &sp); exception_object->private_2 = (uintptr_t)sp; _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): _URC_HANDLER_FOUND \n", + "unwind_phase1(ex_ojb=%p): _URC_HANDLER_FOUND", (void *)exception_object); return _URC_NO_REASON; case _URC_CONTINUE_UNWIND: _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): _URC_CONTINUE_UNWIND\n", + "unwind_phase1(ex_ojb=%p): _URC_CONTINUE_UNWIND", (void *)exception_object); // continue unwinding break; default: // something went wrong _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase1(ex_ojb=%p): _URC_FATAL_PHASE1_ERROR\n", + "unwind_phase1(ex_ojb=%p): _URC_FATAL_PHASE1_ERROR", (void *)exception_object); return _URC_FATAL_PHASE1_ERROR; } } } return _URC_NO_REASON; } static _Unwind_Reason_Code unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) { unw_init_local(cursor, uc); - _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p)\n", + _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p)", (void *)exception_object); // Walk each frame until we reach where search phase said to stop. while (true) { - // Ask libuwind to get next frame (skip over first which is + // Ask libunwind to get next frame (skip over first which is // _Unwind_RaiseException). int stepResult = unw_step(cursor); if (stepResult == 0) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): unw_step() reached " - "bottom => _URC_END_OF_STACK\n", + "bottom => _URC_END_OF_STACK", (void *)exception_object); return _URC_END_OF_STACK; } else if (stepResult < 0) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): unw_step failed => " - "_URC_FATAL_PHASE1_ERROR\n", + "_URC_FATAL_PHASE1_ERROR", (void *)exception_object); return _URC_FATAL_PHASE2_ERROR; } // Get info about this frame. 
unw_word_t sp; unw_proc_info_t frameInfo; unw_get_reg(cursor, UNW_REG_SP, &sp); if (unw_get_proc_info(cursor, &frameInfo) != UNW_ESUCCESS) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): unw_get_proc_info " - "failed => _URC_FATAL_PHASE1_ERROR\n", + "failed => _URC_FATAL_PHASE1_ERROR", (void *)exception_object); return _URC_FATAL_PHASE2_ERROR; } // When tracing, print state information. if (_LIBUNWIND_TRACING_UNWINDING) { char functionBuf[512]; const char *functionName = functionBuf; unw_word_t offset; if ((unw_get_proc_name(cursor, functionBuf, sizeof(functionBuf), &offset) != UNW_ESUCCESS) || (frameInfo.start_ip + offset > frameInfo.end_ip)) functionName = ".anonymous."; _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): start_ip=0x%" PRIx64 ", func=%s, sp=0x%" PRIx64 ", lsda=0x%" PRIx64 - ", personality=0x%" PRIx64 "\n", + ", personality=0x%" PRIx64, (void *)exception_object, frameInfo.start_ip, functionName, sp, frameInfo.lsda, frameInfo.handler); } // If there is a personality routine, tell it we are unwinding. if (frameInfo.handler != 0) { __personality_routine p = (__personality_routine)(long)(frameInfo.handler); _Unwind_Action action = _UA_CLEANUP_PHASE; if (sp == exception_object->private_2) { // Tell personality this was the frame it marked in phase 1. action = (_Unwind_Action)(_UA_CLEANUP_PHASE | _UA_HANDLER_FRAME); } _Unwind_Reason_Code personalityResult = (*p)(1, action, exception_object->exception_class, exception_object, (struct _Unwind_Context *)(cursor)); switch (personalityResult) { case _URC_CONTINUE_UNWIND: // Continue unwinding _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2(ex_ojb=%p): _URC_CONTINUE_UNWIND\n", + "unwind_phase2(ex_ojb=%p): _URC_CONTINUE_UNWIND", (void *)exception_object); if (sp == exception_object->private_2) { // Phase 1 said we would stop at this frame, but we did not... _LIBUNWIND_ABORT("during phase1 personality function said it would " "stop here, but now in phase2 it did not stop here"); } break; case _URC_INSTALL_CONTEXT: _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2(ex_ojb=%p): _URC_INSTALL_CONTEXT\n", + "unwind_phase2(ex_ojb=%p): _URC_INSTALL_CONTEXT", (void *)exception_object); // Personality routine says to transfer control to landing pad. // We may get control back if landing pad calls _Unwind_Resume(). if (_LIBUNWIND_TRACING_UNWINDING) { unw_word_t pc; unw_get_reg(cursor, UNW_REG_IP, &pc); unw_get_reg(cursor, UNW_REG_SP, &sp); _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_ojb=%p): re-entering " "user code with ip=0x%" PRIx64 - ", sp=0x%" PRIx64 "\n", + ", sp=0x%" PRIx64, (void *)exception_object, pc, sp); } unw_resume(cursor); // unw_resume() only returns if there was an error. return _URC_FATAL_PHASE2_ERROR; default: // Personality routine returned an unknown result code. _LIBUNWIND_DEBUG_LOG("personality function returned unknown result %d", personalityResult); return _URC_FATAL_PHASE2_ERROR; } } } // Clean up phase did not resume at the frame that the search phase // said it would... return _URC_FATAL_PHASE2_ERROR; } static _Unwind_Reason_Code unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object, _Unwind_Stop_Fn stop, void *stop_parameter) { unw_init_local(cursor, uc); // Walk each frame until we reach where search phase said to stop while (unw_step(cursor) > 0) { // Update info about this frame. 
unw_proc_info_t frameInfo; if (unw_get_proc_info(cursor, &frameInfo) != UNW_ESUCCESS) { _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): unw_step " - "failed => _URC_END_OF_STACK\n", + "failed => _URC_END_OF_STACK", (void *)exception_object); return _URC_FATAL_PHASE2_ERROR; } // When tracing, print state information. if (_LIBUNWIND_TRACING_UNWINDING) { char functionBuf[512]; const char *functionName = functionBuf; unw_word_t offset; if ((unw_get_proc_name(cursor, functionBuf, sizeof(functionBuf), &offset) != UNW_ESUCCESS) || (frameInfo.start_ip + offset > frameInfo.end_ip)) functionName = ".anonymous."; _LIBUNWIND_TRACE_UNWINDING( "unwind_phase2_forced(ex_ojb=%p): start_ip=0x%" PRIx64 - ", func=%s, lsda=0x%" PRIx64 ", personality=0x%" PRIx64 "\n", + ", func=%s, lsda=0x%" PRIx64 ", personality=0x%" PRIx64, (void *)exception_object, frameInfo.start_ip, functionName, frameInfo.lsda, frameInfo.handler); } // Call stop function at each frame. _Unwind_Action action = (_Unwind_Action)(_UA_FORCE_UNWIND | _UA_CLEANUP_PHASE); _Unwind_Reason_Code stopResult = (*stop)(1, action, exception_object->exception_class, exception_object, (struct _Unwind_Context *)(cursor), stop_parameter); _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2_forced(ex_ojb=%p): stop function returned %d\n", + "unwind_phase2_forced(ex_ojb=%p): stop function returned %d", (void *)exception_object, stopResult); if (stopResult != _URC_NO_REASON) { _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2_forced(ex_ojb=%p): stopped by stop function\n", + "unwind_phase2_forced(ex_ojb=%p): stopped by stop function", (void *)exception_object); return _URC_FATAL_PHASE2_ERROR; } // If there is a personality routine, tell it we are unwinding. if (frameInfo.handler != 0) { __personality_routine p = (__personality_routine)(long)(frameInfo.handler); _LIBUNWIND_TRACE_UNWINDING( - "unwind_phase2_forced(ex_ojb=%p): calling personality function %p\n", + "unwind_phase2_forced(ex_ojb=%p): calling personality function %p", (void *)exception_object, (void *)(uintptr_t)p); _Unwind_Reason_Code personalityResult = (*p)(1, action, exception_object->exception_class, exception_object, (struct _Unwind_Context *)(cursor)); switch (personalityResult) { case _URC_CONTINUE_UNWIND: _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " "personality returned " - "_URC_CONTINUE_UNWIND\n", + "_URC_CONTINUE_UNWIND", (void *)exception_object); // Destructors called, continue unwinding break; case _URC_INSTALL_CONTEXT: _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " "personality returned " - "_URC_INSTALL_CONTEXT\n", + "_URC_INSTALL_CONTEXT", (void *)exception_object); // We may get control back if landing pad calls _Unwind_Resume(). unw_resume(cursor); break; default: // Personality routine returned an unknown result code. _LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): " "personality returned %d, " - "_URC_FATAL_PHASE2_ERROR\n", + "_URC_FATAL_PHASE2_ERROR", (void *)exception_object, personalityResult); return _URC_FATAL_PHASE2_ERROR; } } } // Call stop function one last time and tell it we've reached the end // of the stack. 
_LIBUNWIND_TRACE_UNWINDING("unwind_phase2_forced(ex_ojb=%p): calling stop " - "function with _UA_END_OF_STACK\n", + "function with _UA_END_OF_STACK", (void *)exception_object); _Unwind_Action lastAction = (_Unwind_Action)(_UA_FORCE_UNWIND | _UA_CLEANUP_PHASE | _UA_END_OF_STACK); (*stop)(1, lastAction, exception_object->exception_class, exception_object, (struct _Unwind_Context *)(cursor), stop_parameter); // Clean up phase did not resume at the frame that the search phase said it // would. return _URC_FATAL_PHASE2_ERROR; } /// Called by __cxa_throw. Only returns if there is a fatal error. _LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_RaiseException(ex_obj=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_RaiseException(ex_obj=%p)", (void *)exception_object); unw_context_t uc; unw_cursor_t cursor; unw_getcontext(&uc); // Mark that this is a non-forced unwind, so _Unwind_Resume() // can do the right thing. exception_object->private_1 = 0; exception_object->private_2 = 0; // phase 1: the search phase _Unwind_Reason_Code phase1 = unwind_phase1(&uc, &cursor, exception_object); if (phase1 != _URC_NO_REASON) return phase1; // phase 2: the clean up phase return unwind_phase2(&uc, &cursor, exception_object); } /// When _Unwind_RaiseException() is in phase2, it hands control /// to the personality function at each frame. The personality /// may force a jump to a landing pad in that function, the landing /// pad code may then call _Unwind_Resume() to continue with the /// unwinding. Note: the call to _Unwind_Resume() is from compiler /// geneated user code. All other _Unwind_* routines are called /// by the C++ runtime __cxa_* routines. /// /// Note: re-throwing an exception (as opposed to continuing the unwind) /// is implemented by having the code call __cxa_rethrow() which /// in turn calls _Unwind_Resume_or_Rethrow(). _LIBUNWIND_EXPORT void _Unwind_Resume(_Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_Resume(ex_obj=%p)\n", (void *)exception_object); + _LIBUNWIND_TRACE_API("_Unwind_Resume(ex_obj=%p)", (void *)exception_object); unw_context_t uc; unw_cursor_t cursor; unw_getcontext(&uc); if (exception_object->private_1 != 0) unwind_phase2_forced(&uc, &cursor, exception_object, (_Unwind_Stop_Fn) exception_object->private_1, (void *)exception_object->private_2); else unwind_phase2(&uc, &cursor, exception_object); // Clients assume _Unwind_Resume() does not return, so all we can do is abort. _LIBUNWIND_ABORT("_Unwind_Resume() can't return"); } /// Not used by C++. /// Unwinds stack, calling "stop" function at each frame. /// Could be used to implement longjmp(). _LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_ForcedUnwind(_Unwind_Exception *exception_object, _Unwind_Stop_Fn stop, void *stop_parameter) { - _LIBUNWIND_TRACE_API("_Unwind_ForcedUnwind(ex_obj=%p, stop=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_ForcedUnwind(ex_obj=%p, stop=%p)", (void *)exception_object, (void *)(uintptr_t)stop); unw_context_t uc; unw_cursor_t cursor; unw_getcontext(&uc); // Mark that this is a forced unwind, so _Unwind_Resume() can do // the right thing. exception_object->private_1 = (uintptr_t) stop; exception_object->private_2 = (uintptr_t) stop_parameter; // do it return unwind_phase2_forced(&uc, &cursor, exception_object, stop, stop_parameter); } /// Called by personality handler during phase 2 to get LSDA for current frame. 
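/// A minimal sketch (assumed and simplified; real personality routines such
/// as __gxx_personality_v0 are considerably more involved) of how a
/// personality routine consumes this value together with
/// _Unwind_GetRegionStart() and _Unwind_GetIP():
///
///     const uint8_t *lsda =
///         (const uint8_t *)_Unwind_GetLanguageSpecificData(context);
///     if (lsda == NULL)
///       return _URC_CONTINUE_UNWIND;   // no call-site tables for this frame
///     uintptr_t funcStart = _Unwind_GetRegionStart(context);
///     uintptr_t pc = _Unwind_GetIP(context) - 1;  // IP is a return address
///     uintptr_t pcOffset = pc - funcStart;
///     // ... decode the LSDA's call-site table using pcOffset ...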
_LIBUNWIND_EXPORT uintptr_t _Unwind_GetLanguageSpecificData(struct _Unwind_Context *context) { unw_cursor_t *cursor = (unw_cursor_t *)context; unw_proc_info_t frameInfo; uintptr_t result = 0; if (unw_get_proc_info(cursor, &frameInfo) == UNW_ESUCCESS) result = (uintptr_t)frameInfo.lsda; _LIBUNWIND_TRACE_API( - "_Unwind_GetLanguageSpecificData(context=%p) => 0x%" PRIxPTR "\n", + "_Unwind_GetLanguageSpecificData(context=%p) => 0x%" PRIxPTR, (void *)context, result); if (result != 0) { if (*((uint8_t *)result) != 0xFF) - _LIBUNWIND_DEBUG_LOG("lsda at 0x%" PRIxPTR " does not start with 0xFF\n", + _LIBUNWIND_DEBUG_LOG("lsda at 0x%" PRIxPTR " does not start with 0xFF", result); } return result; } /// Called by personality handler during phase 2 to find the start of the /// function. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetRegionStart(struct _Unwind_Context *context) { unw_cursor_t *cursor = (unw_cursor_t *)context; unw_proc_info_t frameInfo; uintptr_t result = 0; if (unw_get_proc_info(cursor, &frameInfo) == UNW_ESUCCESS) result = (uintptr_t)frameInfo.start_ip; - _LIBUNWIND_TRACE_API("_Unwind_GetRegionStart(context=%p) => 0x%" PRIxPTR "\n", + _LIBUNWIND_TRACE_API("_Unwind_GetRegionStart(context=%p) => 0x%" PRIxPTR, (void *)context, result); return result; } /// Called by personality handler during phase 2 if a foreign exception // is caught. _LIBUNWIND_EXPORT void _Unwind_DeleteException(_Unwind_Exception *exception_object) { - _LIBUNWIND_TRACE_API("_Unwind_DeleteException(ex_obj=%p)\n", + _LIBUNWIND_TRACE_API("_Unwind_DeleteException(ex_obj=%p)", (void *)exception_object); if (exception_object->exception_cleanup != NULL) (*exception_object->exception_cleanup)(_URC_FOREIGN_EXCEPTION_CAUGHT, exception_object); } /// Called by personality handler during phase 2 to get register values. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetGR(struct _Unwind_Context *context, int index) { unw_cursor_t *cursor = (unw_cursor_t *)context; unw_word_t result; unw_get_reg(cursor, index, &result); - _LIBUNWIND_TRACE_API("_Unwind_GetGR(context=%p, reg=%d) => 0x%" PRIx64 "\n", + _LIBUNWIND_TRACE_API("_Unwind_GetGR(context=%p, reg=%d) => 0x%" PRIx64, (void *)context, index, (uint64_t)result); return (uintptr_t)result; } /// Called by personality handler during phase 2 to alter register values. _LIBUNWIND_EXPORT void _Unwind_SetGR(struct _Unwind_Context *context, int index, uintptr_t value) { _LIBUNWIND_TRACE_API("_Unwind_SetGR(context=%p, reg=%d, value=0x%0" PRIx64 - ")\n", + ")", (void *)context, index, (uint64_t)value); unw_cursor_t *cursor = (unw_cursor_t *)context; unw_set_reg(cursor, index, value); } /// Called by personality handler during phase 2 to get instruction pointer. _LIBUNWIND_EXPORT uintptr_t _Unwind_GetIP(struct _Unwind_Context *context) { unw_cursor_t *cursor = (unw_cursor_t *)context; unw_word_t result; unw_get_reg(cursor, UNW_REG_IP, &result); - _LIBUNWIND_TRACE_API("_Unwind_GetIP(context=%p) => 0x%" PRIx64 "\n", + _LIBUNWIND_TRACE_API("_Unwind_GetIP(context=%p) => 0x%" PRIx64, (void *)context, (uint64_t)result); return (uintptr_t)result; } /// Called by personality handler during phase 2 to alter instruction pointer, /// such as setting where the landing pad is, so _Unwind_Resume() will /// start executing in the landing pad. 
_LIBUNWIND_EXPORT void _Unwind_SetIP(struct _Unwind_Context *context, uintptr_t value) { - _LIBUNWIND_TRACE_API("_Unwind_SetIP(context=%p, value=0x%0" PRIx64 ")\n", + _LIBUNWIND_TRACE_API("_Unwind_SetIP(context=%p, value=0x%0" PRIx64 ")", (void *)context, (uint64_t)value); unw_cursor_t *cursor = (unw_cursor_t *)context; unw_set_reg(cursor, UNW_REG_IP, value); } #endif // !_LIBUNWIND_ARM_EHABI Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/config.h =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/config.h (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/config.h (revision 308054) @@ -1,144 +1,144 @@ //===----------------------------- config.h -------------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // -// Defines macros used within libuwind project. +// Defines macros used within libunwind project. // //===----------------------------------------------------------------------===// #ifndef LIBUNWIND_CONFIG_H #define LIBUNWIND_CONFIG_H #include #include #include // Define static_assert() unless already defined by compiler. #ifndef __has_feature #define __has_feature(__x) 0 #endif #if !(__has_feature(cxx_static_assert)) && !defined(static_assert) #define static_assert(__b, __m) \ extern int compile_time_assert_failed[ ( __b ) ? 1 : -1 ] \ __attribute__( ( unused ) ); #endif // Platform specific configuration defines. #ifdef __APPLE__ #if defined(FOR_DYLD) #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 0 #define _LIBUNWIND_SUPPORT_DWARF_INDEX 0 #else #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_INDEX 0 #endif #else #if defined(__ARM_DWARF_EH__) || !defined(__arm__) #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 0 #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_INDEX 1 #else #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 0 #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 0 #define _LIBUNWIND_SUPPORT_DWARF_INDEX 0 #endif #endif // FIXME: these macros are not correct for COFF targets #define _LIBUNWIND_EXPORT __attribute__((visibility("default"))) #define _LIBUNWIND_HIDDEN __attribute__((visibility("hidden"))) #if (defined(__APPLE__) && defined(__arm__)) || defined(__USING_SJLJ_EXCEPTIONS__) #define _LIBUNWIND_BUILD_SJLJ_APIS 1 #else #define _LIBUNWIND_BUILD_SJLJ_APIS 0 #endif #if defined(__i386__) || defined(__x86_64__) #define _LIBUNWIND_SUPPORT_FRAME_APIS 1 #else #define _LIBUNWIND_SUPPORT_FRAME_APIS 0 #endif #if defined(__i386__) || defined(__x86_64__) || \ (!defined(__APPLE__) && defined(__arm__)) || \ (defined(__arm64__) || defined(__aarch64__)) || \ (defined(__APPLE__) && defined(__mips__)) || \ defined(__riscv__) #define _LIBUNWIND_BUILD_ZERO_COST_APIS 1 #else #define _LIBUNWIND_BUILD_ZERO_COST_APIS 0 #endif #define _LIBUNWIND_ABORT(msg) \ do { \ fprintf(stderr, "libunwind: %s %s:%d - %s\n", __func__, __FILE__, \ __LINE__, msg); \ fflush(stderr); \ abort(); \ } while (0) -#define _LIBUNWIND_LOG(msg, ...) fprintf(stderr, "libuwind: " msg, __VA_ARGS__) +#define _LIBUNWIND_LOG(msg, ...) fprintf(stderr, "libunwind: " msg "\n", __VA_ARGS__) // Macros that define away in non-Debug builds #ifdef NDEBUG #define _LIBUNWIND_DEBUG_LOG(msg, ...) #define _LIBUNWIND_TRACE_API(msg, ...) 
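A note on the config.h hunk just above, before the rest of the NDEBUG stubs: moving the trailing "\n" into _LIBUNWIND_LOG is what allows every trace call site touched in this revision to drop the newline from its format string. A minimal standalone sketch of the idea follows; DEMO_LOG is a hypothetical stand-in, not part of the libunwind build.

#include <stdio.h>

/* Hypothetical stand-in for the revised _LIBUNWIND_LOG: the macro pastes the
 * "libunwind: " prefix and the newline around the caller's format string, so
 * call sites pass "unw_step(cursor=%p)" rather than "unw_step(cursor=%p)\n". */
#define DEMO_LOG(msg, ...) fprintf(stderr, "libunwind: " msg "\n", __VA_ARGS__)

int main(void) {
    void *cursor = (void *)0x1234;           /* dummy value for illustration */
    DEMO_LOG("unw_step(cursor=%p)", cursor); /* newline supplied by the macro */
    return 0;
}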
#define _LIBUNWIND_TRACING_UNWINDING 0 #define _LIBUNWIND_TRACE_UNWINDING(msg, ...) #define _LIBUNWIND_LOG_NON_ZERO(x) x #else #ifdef __cplusplus extern "C" { #endif extern bool logAPIs(); extern bool logUnwinding(); #ifdef __cplusplus } #endif #define _LIBUNWIND_DEBUG_LOG(msg, ...) _LIBUNWIND_LOG(msg, __VA_ARGS__) #define _LIBUNWIND_LOG_NON_ZERO(x) \ do { \ int _err = x; \ if ( _err != 0 ) \ _LIBUNWIND_LOG("" #x "=%d in %s", _err, __FUNCTION__); \ } while (0) #define _LIBUNWIND_TRACE_API(msg, ...) \ do { \ if ( logAPIs() ) _LIBUNWIND_LOG(msg, __VA_ARGS__); \ } while(0) #define _LIBUNWIND_TRACE_UNWINDING(msg, ...) \ do { \ if ( logUnwinding() ) _LIBUNWIND_LOG(msg, __VA_ARGS__); \ } while(0) #define _LIBUNWIND_TRACING_UNWINDING logUnwinding() #endif #ifdef __cplusplus // Used to fit UnwindCursor and Registers_xxx types against unw_context_t / // unw_cursor_t sized memory blocks. #if defined(_LIBUNWIND_IS_NATIVE_ONLY) # define COMP_OP == #else # define COMP_OP < #endif template struct check_fit { template struct blk_count { static const size_t count = (sizeof(T) + sizeof(uint64_t) - 1) / sizeof(uint64_t); }; static const bool does_fit = (blk_count<_Type>::count COMP_OP blk_count<_Mem>::count); }; #undef COMP_OP #endif // __cplusplus #endif // LIBUNWIND_CONFIG_H Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/libunwind.cpp =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/libunwind.cpp (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind/src/libunwind.cpp (revision 308054) @@ -1,377 +1,377 @@ -//===--------------------------- libuwind.cpp -----------------------------===// +//===--------------------------- libunwind.cpp ----------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.TXT for details. // // // Implements unw_* functions from // //===----------------------------------------------------------------------===// #include #ifndef NDEBUG #include // getenv #endif #include #include #include "libunwind_ext.h" #include "config.h" #include #include "UnwindCursor.hpp" using namespace libunwind; /// internal object to represent this processes address space LocalAddressSpace LocalAddressSpace::sThisAddressSpace; _LIBUNWIND_EXPORT unw_addr_space_t unw_local_addr_space = (unw_addr_space_t)&LocalAddressSpace::sThisAddressSpace; /// record the registers and stack position of the caller extern int unw_getcontext(unw_context_t *); // note: unw_getcontext() implemented in assembly /// Create a cursor of a thread in this process given 'context' recorded by /// unw_getcontext(). _LIBUNWIND_EXPORT int unw_init_local(unw_cursor_t *cursor, unw_context_t *context) { - _LIBUNWIND_TRACE_API("unw_init_local(cursor=%p, context=%p)\n", + _LIBUNWIND_TRACE_API("unw_init_local(cursor=%p, context=%p)", static_cast(cursor), static_cast(context)); #if defined(__i386__) # define REGISTER_KIND Registers_x86 #elif defined(__x86_64__) # define REGISTER_KIND Registers_x86_64 #elif defined(__ppc__) # define REGISTER_KIND Registers_ppc #elif defined(__aarch64__) # define REGISTER_KIND Registers_arm64 #elif _LIBUNWIND_ARM_EHABI # define REGISTER_KIND Registers_arm #elif defined(__or1k__) # define REGISTER_KIND Registers_or1k #elif defined(__riscv__) # define REGISTER_KIND Registers_riscv #elif defined(__mips__) # warning The MIPS architecture is not supported. 
#else # error Architecture not supported #endif // Use "placement new" to allocate UnwindCursor in the cursor buffer. new ((void *)cursor) UnwindCursor( context, LocalAddressSpace::sThisAddressSpace); #undef REGISTER_KIND AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; co->setInfoBasedOnIPRegister(); return UNW_ESUCCESS; } #ifdef UNW_REMOTE /// Create a cursor into a thread in another process. _LIBUNWIND_EXPORT int unw_init_remote_thread(unw_cursor_t *cursor, unw_addr_space_t as, void *arg) { // special case: unw_init_remote(xx, unw_local_addr_space, xx) if (as == (unw_addr_space_t)&LocalAddressSpace::sThisAddressSpace) return unw_init_local(cursor, NULL); //FIXME // use "placement new" to allocate UnwindCursor in the cursor buffer switch (as->cpuType) { case CPU_TYPE_I386: new ((void *)cursor) UnwindCursor >, Registers_x86>(((unw_addr_space_i386 *)as)->oas, arg); break; case CPU_TYPE_X86_64: new ((void *)cursor) UnwindCursor< OtherAddressSpace >, Registers_x86_64>( ((unw_addr_space_x86_64 *)as)->oas, arg); break; case CPU_TYPE_POWERPC: new ((void *)cursor) UnwindCursor >, Registers_ppc>( ((unw_addr_space_ppc *)as)->oas, arg); break; default: return UNW_EUNSPEC; } return UNW_ESUCCESS; } static bool is64bit(task_t task) { return false; // FIXME } /// Create an address_space object for use in examining another task. _LIBUNWIND_EXPORT unw_addr_space_t unw_create_addr_space_for_task(task_t task) { #if __i386__ if (is64bit(task)) { unw_addr_space_x86_64 *as = new unw_addr_space_x86_64(task); as->taskPort = task; as->cpuType = CPU_TYPE_X86_64; //as->oas } else { unw_addr_space_i386 *as = new unw_addr_space_i386(task); as->taskPort = task; as->cpuType = CPU_TYPE_I386; //as->oas } #else // FIXME #endif } /// Delete an address_space object. _LIBUNWIND_EXPORT void unw_destroy_addr_space(unw_addr_space_t asp) { switch (asp->cpuType) { #if __i386__ || __x86_64__ case CPU_TYPE_I386: { unw_addr_space_i386 *as = (unw_addr_space_i386 *)asp; delete as; } break; case CPU_TYPE_X86_64: { unw_addr_space_x86_64 *as = (unw_addr_space_x86_64 *)asp; delete as; } break; #endif case CPU_TYPE_POWERPC: { unw_addr_space_ppc *as = (unw_addr_space_ppc *)asp; delete as; } break; } } #endif // UNW_REMOTE /// Get value of specified register at cursor position in stack frame. _LIBUNWIND_EXPORT int unw_get_reg(unw_cursor_t *cursor, unw_regnum_t regNum, unw_word_t *value) { - _LIBUNWIND_TRACE_API("unw_get_reg(cursor=%p, regNum=%d, &value=%p)\n", + _LIBUNWIND_TRACE_API("unw_get_reg(cursor=%p, regNum=%d, &value=%p)", static_cast(cursor), regNum, static_cast(value)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; if (co->validReg(regNum)) { *value = co->getReg(regNum); return UNW_ESUCCESS; } return UNW_EBADREG; } /// Set value of specified register at cursor position in stack frame. _LIBUNWIND_EXPORT int unw_set_reg(unw_cursor_t *cursor, unw_regnum_t regNum, unw_word_t value) { - _LIBUNWIND_TRACE_API("unw_set_reg(cursor=%p, regNum=%d, value=0x%llX)\n", + _LIBUNWIND_TRACE_API("unw_set_reg(cursor=%p, regNum=%d, value=0x%llX)", static_cast(cursor), regNum, (long long)value); typedef LocalAddressSpace::pint_t pint_t; AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; if (co->validReg(regNum)) { co->setReg(regNum, (pint_t)value); // specical case altering IP to re-find info (being called by personality // function) if (regNum == UNW_REG_IP) co->setInfoBasedOnIPRegister(false); return UNW_ESUCCESS; } return UNW_EBADREG; } /// Get value of specified float register at cursor position in stack frame. 
_LIBUNWIND_EXPORT int unw_get_fpreg(unw_cursor_t *cursor, unw_regnum_t regNum, unw_fpreg_t *value) { - _LIBUNWIND_TRACE_API("unw_get_fpreg(cursor=%p, regNum=%d, &value=%p)\n", + _LIBUNWIND_TRACE_API("unw_get_fpreg(cursor=%p, regNum=%d, &value=%p)", static_cast(cursor), regNum, static_cast(value)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; if (co->validFloatReg(regNum)) { *value = co->getFloatReg(regNum); return UNW_ESUCCESS; } return UNW_EBADREG; } /// Set value of specified float register at cursor position in stack frame. _LIBUNWIND_EXPORT int unw_set_fpreg(unw_cursor_t *cursor, unw_regnum_t regNum, unw_fpreg_t value) { #if _LIBUNWIND_ARM_EHABI - _LIBUNWIND_TRACE_API("unw_set_fpreg(cursor=%p, regNum=%d, value=%llX)\n", + _LIBUNWIND_TRACE_API("unw_set_fpreg(cursor=%p, regNum=%d, value=%llX)", static_cast(cursor), regNum, value); #else - _LIBUNWIND_TRACE_API("unw_set_fpreg(cursor=%p, regNum=%d, value=%g)\n", + _LIBUNWIND_TRACE_API("unw_set_fpreg(cursor=%p, regNum=%d, value=%g)", static_cast(cursor), regNum, value); #endif AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; if (co->validFloatReg(regNum)) { co->setFloatReg(regNum, value); return UNW_ESUCCESS; } return UNW_EBADREG; } /// Move cursor to next frame. _LIBUNWIND_EXPORT int unw_step(unw_cursor_t *cursor) { - _LIBUNWIND_TRACE_API("unw_step(cursor=%p)\n", static_cast(cursor)); + _LIBUNWIND_TRACE_API("unw_step(cursor=%p)", static_cast(cursor)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; return co->step(); } /// Get unwind info at cursor position in stack frame. _LIBUNWIND_EXPORT int unw_get_proc_info(unw_cursor_t *cursor, unw_proc_info_t *info) { - _LIBUNWIND_TRACE_API("unw_get_proc_info(cursor=%p, &info=%p)\n", + _LIBUNWIND_TRACE_API("unw_get_proc_info(cursor=%p, &info=%p)", static_cast(cursor), static_cast(info)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; co->getInfo(info); if (info->end_ip == 0) return UNW_ENOINFO; else return UNW_ESUCCESS; } /// Resume execution at cursor position (aka longjump). _LIBUNWIND_EXPORT int unw_resume(unw_cursor_t *cursor) { - _LIBUNWIND_TRACE_API("unw_resume(cursor=%p)\n", static_cast(cursor)); + _LIBUNWIND_TRACE_API("unw_resume(cursor=%p)", static_cast(cursor)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; co->jumpto(); return UNW_EUNSPEC; } /// Get name of function at cursor position in stack frame. _LIBUNWIND_EXPORT int unw_get_proc_name(unw_cursor_t *cursor, char *buf, size_t bufLen, unw_word_t *offset) { - _LIBUNWIND_TRACE_API("unw_get_proc_name(cursor=%p, &buf=%p, bufLen=%lu)\n", + _LIBUNWIND_TRACE_API("unw_get_proc_name(cursor=%p, &buf=%p, bufLen=%lu)", static_cast(cursor), static_cast(buf), static_cast(bufLen)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; if (co->getFunctionName(buf, bufLen, offset)) return UNW_ESUCCESS; else return UNW_EUNSPEC; } /// Checks if a register is a floating-point register. _LIBUNWIND_EXPORT int unw_is_fpreg(unw_cursor_t *cursor, unw_regnum_t regNum) { - _LIBUNWIND_TRACE_API("unw_is_fpreg(cursor=%p, regNum=%d)\n", + _LIBUNWIND_TRACE_API("unw_is_fpreg(cursor=%p, regNum=%d)", static_cast(cursor), regNum); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; return co->validFloatReg(regNum); } /// Checks if a register is a floating-point register. 
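Before the remaining register helpers, a usage aside: the unw_* entry points traced in these hunks (unw_getcontext, unw_init_local, unw_step, unw_get_reg, unw_get_proc_name) compose into the usual in-process backtrace loop. A minimal sketch, assuming <libunwind.h> from this tree and eliding most error handling:

#include <libunwind.h>
#include <stdio.h>

/* Walk the caller's stack and print one line per frame. */
void show_backtrace(void) {
    unw_context_t uc;
    unw_cursor_t cursor;
    unw_word_t ip, off;
    char name[256];

    unw_getcontext(&uc);             /* snapshot the current registers */
    unw_init_local(&cursor, &uc);    /* cursor over this process's stack */
    while (unw_step(&cursor) > 0) {  /* returns 0 at the outermost frame */
        unw_get_reg(&cursor, UNW_REG_IP, &ip);
        if (unw_get_proc_name(&cursor, name, sizeof(name), &off) != UNW_ESUCCESS)
            name[0] = '\0';
        printf("ip=0x%llx %s+0x%llx\n", (unsigned long long)ip, name,
               (unsigned long long)off);
    }
}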
_LIBUNWIND_EXPORT const char *unw_regname(unw_cursor_t *cursor, unw_regnum_t regNum) { - _LIBUNWIND_TRACE_API("unw_regname(cursor=%p, regNum=%d)\n", + _LIBUNWIND_TRACE_API("unw_regname(cursor=%p, regNum=%d)", static_cast(cursor), regNum); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; return co->getRegisterName(regNum); } /// Checks if current frame is signal trampoline. _LIBUNWIND_EXPORT int unw_is_signal_frame(unw_cursor_t *cursor) { - _LIBUNWIND_TRACE_API("unw_is_signal_frame(cursor=%p)\n", + _LIBUNWIND_TRACE_API("unw_is_signal_frame(cursor=%p)", static_cast(cursor)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; return co->isSignalFrame(); } #ifdef __arm__ // Save VFP registers d0-d15 using FSTMIADX instead of FSTMIADD _LIBUNWIND_EXPORT void unw_save_vfp_as_X(unw_cursor_t *cursor) { - _LIBUNWIND_TRACE_API("unw_fpreg_save_vfp_as_X(cursor=%p)\n", + _LIBUNWIND_TRACE_API("unw_fpreg_save_vfp_as_X(cursor=%p)", static_cast(cursor)); AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor; return co->saveVFPAsX(); } #endif #if _LIBUNWIND_SUPPORT_DWARF_UNWIND /// SPI: walks cached dwarf entries _LIBUNWIND_EXPORT void unw_iterate_dwarf_unwind_cache(void (*func)( unw_word_t ip_start, unw_word_t ip_end, unw_word_t fde, unw_word_t mh)) { - _LIBUNWIND_TRACE_API("unw_iterate_dwarf_unwind_cache(func=%p)\n", + _LIBUNWIND_TRACE_API("unw_iterate_dwarf_unwind_cache(func=%p)", reinterpret_cast(func)); DwarfFDECache::iterateCacheEntries(func); } /// IPI: for __register_frame() void _unw_add_dynamic_fde(unw_word_t fde) { CFI_Parser::FDE_Info fdeInfo; CFI_Parser::CIE_Info cieInfo; const char *message = CFI_Parser::decodeFDE( LocalAddressSpace::sThisAddressSpace, (LocalAddressSpace::pint_t) fde, &fdeInfo, &cieInfo); if (message == NULL) { // dynamically registered FDEs don't have a mach_header group they are in. 
// Use fde as mh_group unw_word_t mh_group = fdeInfo.fdeStart; DwarfFDECache::add((LocalAddressSpace::pint_t)mh_group, fdeInfo.pcStart, fdeInfo.pcEnd, fdeInfo.fdeStart); } else { _LIBUNWIND_DEBUG_LOG("_unw_add_dynamic_fde: bad fde: %s", message); } } /// IPI: for __deregister_frame() void _unw_remove_dynamic_fde(unw_word_t fde) { // fde is own mh_group DwarfFDECache::removeAllIn((LocalAddressSpace::pint_t)fde); } #endif // _LIBUNWIND_SUPPORT_DWARF_UNWIND // Add logging hooks in Debug builds only #ifndef NDEBUG #include _LIBUNWIND_HIDDEN bool logAPIs() { // do manual lock to avoid use of _cxa_guard_acquire or initializers static bool checked = false; static bool log = false; if (!checked) { log = (getenv("LIBUNWIND_PRINT_APIS") != NULL); checked = true; } return log; } _LIBUNWIND_HIDDEN bool logUnwinding() { // do manual lock to avoid use of _cxa_guard_acquire or initializers static bool checked = false; static bool log = false; if (!checked) { log = (getenv("LIBUNWIND_PRINT_UNWINDING") != NULL); checked = true; } return log; } #endif // NDEBUG Index: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind (revision 308054) Property changes on: user/alc/PQ_LAUNDRY/contrib/llvm/projects/libunwind ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/llvm/projects/libunwind:r303642-308053 Index: user/alc/PQ_LAUNDRY/contrib/llvm =================================================================== --- user/alc/PQ_LAUNDRY/contrib/llvm (revision 308053) +++ user/alc/PQ_LAUNDRY/contrib/llvm (revision 308054) Property changes on: user/alc/PQ_LAUNDRY/contrib/llvm ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/llvm:r305322-308053 Index: user/alc/PQ_LAUNDRY/lib/libgcc_eh/Makefile.inc =================================================================== --- user/alc/PQ_LAUNDRY/lib/libgcc_eh/Makefile.inc (revision 308053) +++ user/alc/PQ_LAUNDRY/lib/libgcc_eh/Makefile.inc (revision 308054) @@ -1,30 +1,30 @@ # $FreeBSD$ COMPILERRTDIR= ${SRCTOP}/contrib/compiler-rt UNWINDINCDIR= ${SRCTOP}/contrib/llvm/projects/libunwind/include UNWINDSRCDIR= ${SRCTOP}/contrib/llvm/projects/libunwind/src -CFLAGS+=${PICFLAG} -fvisibility=hidden -DVISIBILITY_HIDDEN +STATIC_CFLAGS+=${PICFLAG} -fvisibility=hidden -DVISIBILITY_HIDDEN .PATH: ${COMPILERRTDIR}/lib/builtins .PATH: ${UNWINDSRCDIR} SRCS+= gcc_personality_v0.c SRCS+= int_util.c SRCS+= Unwind-EHABI.cpp SRCS+= Unwind-sjlj.c SRCS+= UnwindLevel1-gcc-ext.c SRCS+= UnwindLevel1.c SRCS+= UnwindRegistersRestore.S SRCS+= UnwindRegistersSave.S SRCS+= libunwind.cpp CFLAGS+= -I${UNWINDINCDIR} -I${.CURDIR} -D_LIBUNWIND_IS_NATIVE_ONLY .if empty(CXXFLAGS:M-std=*) CXXFLAGS+= -std=c++11 .endif CXXFLAGS+= -fno-rtti STATIC_CXXFLAGS+= -fvisibility=hidden -fPIC .if ${MK_DIRDEPS_BUILD} == "yes" # Avoid dependency on lib/libc++ CFLAGS+= -I${SRCTOP}/contrib/libc++/include .endif Index: user/alc/PQ_LAUNDRY/share/mk/bsd.suffixes.mk =================================================================== --- user/alc/PQ_LAUNDRY/share/mk/bsd.suffixes.mk (revision 308053) +++ user/alc/PQ_LAUNDRY/share/mk/bsd.suffixes.mk (revision 308054) @@ -1,111 +1,111 @@ # $FreeBSD$ .sh: cp -f ${.IMPSRC} ${.TARGET} chmod a+x ${.TARGET} .c.ln: ${LINT} ${LINTOBJFLAGS} 
${CFLAGS:M-[DIU]*} ${.IMPSRC} || \ touch ${.TARGET} .cc.ln .C.ln .cpp.ln .cxx.ln: ${LINT} ${LINTOBJFLAGS} ${CXXFLAGS:M-[DIU]*} ${.IMPSRC} || \ touch ${.TARGET} .c: ${CC} ${CFLAGS} ${LDFLAGS} ${.IMPSRC} ${LDLIBS} -o ${.TARGET} ${CTFCONVERT_CMD} .c.o: ${CC} ${STATIC_CFLAGS} ${CFLAGS} -c ${.IMPSRC} -o ${.TARGET} ${CTFCONVERT_CMD} -.c.bc: +.c.bco: ${CC} -emit-llvm ${IR_CFLAGS} -c ${.IMPSRC} -o ${.TARGET} -.c.ll: +.c.llo: ${CC} -emit-llvm ${IR_CFLAGS} -S ${.IMPSRC} -o ${.TARGET} .cc .cpp .cxx .C: ${CXX} ${CXXFLAGS} ${LDFLAGS} ${.IMPSRC} ${LDLIBS} -o ${.TARGET} .cc.o .cpp.o .cxx.o .C.o: ${CXX} ${STATIC_CXXFLAGS} ${CXXFLAGS} -c ${.IMPSRC} -o ${.TARGET} -.cc.bc .cpp.bc .cxx.bc .C.bc: +.cc.bco .cpp.bco .cxx.bco .C.bco: ${CXX} -emit-llvm ${IR_CXXFLAGS} -c ${.IMPSRC} -o ${.TARGET} -.cc.ll .cpp.ll .cxx.ll .C.ll: +.cc.llo .cpp.llo .cxx.llo .C.llo: ${CXX} -emit-llvm ${IR_CXXFLAGS} -S ${.IMPSRC} -o ${.TARGET} .m.o: ${OBJC} ${OBJCFLAGS} -c ${.IMPSRC} -o ${.TARGET} ${CTFCONVERT_CMD} .p.o: ${PC} ${PFLAGS} -c ${.IMPSRC} -o ${.TARGET} ${CTFCONVERT_CMD} .e .r .F .f: ${FC} ${RFLAGS} ${EFLAGS} ${FFLAGS} ${LDFLAGS} ${.IMPSRC} ${LDLIBS} \ -o ${.TARGET} .e.o .r.o .F.o .f.o: ${FC} ${RFLAGS} ${EFLAGS} ${FFLAGS} -c ${.IMPSRC} -o ${.TARGET} .S.o: ${CC:N${CCACHE_BIN}} ${CFLAGS} ${ACFLAGS} -c ${.IMPSRC} -o ${.TARGET} ${CTFCONVERT_CMD} .asm.o: ${CC:N${CCACHE_BIN}} -x assembler-with-cpp ${CFLAGS} ${ACFLAGS} -c ${.IMPSRC} \ -o ${.TARGET} ${CTFCONVERT_CMD} .s.o: ${AS} ${AFLAGS} -o ${.TARGET} ${.IMPSRC} ${CTFCONVERT_CMD} # XXX not -j safe .y.o: ${YACC} ${YFLAGS} ${.IMPSRC} ${CC} ${CFLAGS} -c y.tab.c -o ${.TARGET} rm -f y.tab.c ${CTFCONVERT_CMD} .l.o: ${LEX} -t ${LFLAGS} ${.IMPSRC} > ${.PREFIX}.tmp.c ${CC} ${CFLAGS} -c ${.PREFIX}.tmp.c -o ${.TARGET} rm -f ${.PREFIX}.tmp.c ${CTFCONVERT_CMD} # XXX not -j safe .y.c: ${YACC} ${YFLAGS} ${.IMPSRC} mv y.tab.c ${.TARGET} .l.c: ${LEX} -t ${LFLAGS} ${.IMPSRC} > ${.TARGET} .s.out .c.out .o.out: ${CC} ${CFLAGS} ${LDFLAGS} ${.IMPSRC} ${LDLIBS} -o ${.TARGET} ${CTFCONVERT_CMD} .f.out .F.out .r.out .e.out: ${FC} ${EFLAGS} ${RFLAGS} ${FFLAGS} ${LDFLAGS} ${.IMPSRC} \ ${LDLIBS} -o ${.TARGET} rm -f ${.PREFIX}.o ${CTFCONVERT_CMD} # XXX not -j safe .y.out: ${YACC} ${YFLAGS} ${.IMPSRC} ${CC} ${CFLAGS} ${LDFLAGS} y.tab.c ${LDLIBS} -ly -o ${.TARGET} rm -f y.tab.c ${CTFCONVERT_CMD} .l.out: ${LEX} -t ${LFLAGS} ${.IMPSRC} > ${.PREFIX}.tmp.c ${CC} ${CFLAGS} ${LDFLAGS} ${.PREFIX}.tmp.c ${LDLIBS} -ll -o ${.TARGET} rm -f ${.PREFIX}.tmp.c ${CTFCONVERT_CMD} Index: user/alc/PQ_LAUNDRY/share/mk/sys.mk =================================================================== --- user/alc/PQ_LAUNDRY/share/mk/sys.mk (revision 308053) +++ user/alc/PQ_LAUNDRY/share/mk/sys.mk (revision 308054) @@ -1,327 +1,327 @@ # from: @(#)sys.mk 8.2 (Berkeley) 3/21/94 # $FreeBSD$ unix ?= We run FreeBSD, not UNIX. .FreeBSD ?= true .if !defined(%POSIX) # # MACHINE_CPUARCH defines a collection of MACHINE_ARCH. Machines with # the same MACHINE_ARCH can run each other's binaries, so it necessarily # has word size and endian swizzled in. However, support files for # these machines often are shared amongst all combinations of size # and/or endian. This is called MACHINE_CPU in NetBSD, but that's used # for something different in FreeBSD. 
# MACHINE_CPUARCH=${MACHINE_ARCH:C/mips(n32|64)?(el)?/mips/:C/arm(v6)?(eb|hf)?/arm/:C/powerpc(64|spe)/powerpc/:C/riscv64/riscv/} .endif # Some options we need now __DEFAULT_NO_OPTIONS= \ DIRDEPS_BUILD \ DIRDEPS_CACHE __DEFAULT_DEPENDENT_OPTIONS= \ AUTO_OBJ/DIRDEPS_BUILD \ META_MODE/DIRDEPS_BUILD \ STAGING/DIRDEPS_BUILD \ SYSROOT/DIRDEPS_BUILD __ENV_ONLY_OPTIONS:= \ ${__DEFAULT_NO_OPTIONS} \ ${__DEFAULT_YES_OPTIONS} \ ${__DEFAULT_DEPENDENT_OPTIONS:H} # early include for customization # see local.sys.mk below # Not included when building in fmake compatibility mode (still needed # for older system support) .if defined(.PARSEDIR) .sinclude .include # Disable MK_META_MODE with make -B .if ${MK_META_MODE} == "yes" && defined(.MAKEFLAGS) && ${.MAKEFLAGS:M-B} MK_META_MODE= no .endif .if ${MK_DIRDEPS_BUILD} == "yes" .sinclude .elif ${MK_META_MODE} == "yes" # verbose will show .MAKE.META.PREFIX for each target. META_MODE+= meta verbose .if !defined(NO_META_MISSING) META_MODE+= missing-meta=yes .endif # silent will hide command output if a .meta file is created. .if !defined(NO_SILENT) META_MODE+= silent=yes .endif .if !exists(/dev/filemon) META_MODE+= nofilemon .endif # Require filemon data with bmake .if empty(META_MODE:Mnofilemon) META_MODE+= missing-filemon=yes .endif .endif META_MODE?= normal .export META_MODE .MAKE.MODE?= ${META_MODE} .if !empty(.MAKE.MODE:Mmeta) && !defined(NO_META_IGNORE_HOST) # Ignore host file changes that will otherwise cause # buildworld -> installworld -> buildworld to rebuild everything. # Since the build is self-reliant and bootstraps everything it needs, # this should not be a real problem for incremental builds. # XXX: This relies on the existing host tools retaining ABI compatibility # through upgrades since they won't be rebuilt on header/library changes. # Note that these are prefix matching, so /lib matches /libexec. .MAKE.META.IGNORE_PATHS+= \ ${__MAKE_SHELL} \ /bin \ /lib \ /rescue \ /sbin \ /usr/bin \ /usr/include \ /usr/lib \ /usr/sbin \ /usr/share \ .endif .if ${MK_AUTO_OBJ} == "yes" # This needs to be done early - before .PATH is computed # Don't do this for 'make showconfig' as it enables all options where meta mode # is not expected. .if !make(showconfig) && !make(print-dir) .sinclude .endif .endif .else # bmake .include .endif # If the special target .POSIX appears (without prerequisites or # commands) before the first noncomment line in the makefile, make shall # process the makefile as specified by the Posix 1003.2 specification. # make(1) sets the special macro %POSIX in this case (to the actual # value "1003.2", for what it's worth). # # The rules below use this macro to distinguish between Posix-compliant # and default behaviour. # # This functionality is currently broken, since make(1) processes sys.mk # before reading any other files, and consequently has no opportunity to # set the %POSIX macro before we read this point. 
.if defined(%POSIX) .SUFFIXES: .o .c .y .l .a .sh .f .else -.SUFFIXES: .out .a .ln .o .bc .ll .c .cc .cpp .cxx .C .m .F .f .e .r .y .l .S .asm .s .cl .p .h .sh +.SUFFIXES: .out .a .ln .o .bco .llo .c .cc .cpp .cxx .C .m .F .f .e .r .y .l .S .asm .s .cl .p .h .sh .endif AR ?= ar .if defined(%POSIX) ARFLAGS ?= -rv .else ARFLAGS ?= -crD .endif RANLIB ?= ranlib .if !defined(%POSIX) RANLIBFLAGS ?= -D .endif AS ?= as AFLAGS ?= ACFLAGS ?= .if defined(%POSIX) CC ?= c89 CFLAGS ?= -O .else CC ?= cc .if ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "mips" CFLAGS ?= -O -pipe .else CFLAGS ?= -O2 -pipe .endif .if defined(NO_STRICT_ALIASING) CFLAGS += -fno-strict-aliasing .endif .endif IR_CFLAGS ?= ${STATIC_CFLAGS:N-O*} ${CFLAGS:N-O*} PO_CFLAGS ?= ${CFLAGS} # cp(1) is used to copy source files to ${.OBJDIR}, make sure it can handle # read-only files as non-root by passing -f. CP ?= cp -f CPP ?= cpp # C Type Format data is required for DTrace CTFFLAGS ?= -L VERSION CTFCONVERT ?= ctfconvert CTFMERGE ?= ctfmerge .if defined(CFLAGS) && (${CFLAGS:M-g} != "") CTFFLAGS += -g .endif CXX ?= c++ CXXFLAGS ?= ${CFLAGS:N-std=*:N-Wnested-externs:N-W*-prototypes:N-Wno-pointer-sign:N-Wold-style-definition} IR_CXXFLAGS ?= ${STATIC_CXXFLAGS:N-O*} ${CXXFLAGS:N-O*} PO_CXXFLAGS ?= ${CXXFLAGS} DTRACE ?= dtrace DTRACEFLAGS ?= -C -x nolibs .if empty(.MAKEFLAGS:M-s) ECHO ?= echo ECHODIR ?= echo .else ECHO ?= true .if ${.MAKEFLAGS:M-s} == "-s" ECHODIR ?= echo .else ECHODIR ?= true .endif .endif .if ${.MAKEFLAGS:M-N} # bmake -N is supposed to skip executing anything but it does not skip # exeucting '+' commands. The '+' feature is used where .MAKE # is not safe for the entire target. -N is intended to skip building sub-makes # so it executing '+' commands is not right. Work around the bug by not # setting '+' when -N is used. _+_ ?= .else _+_ ?= + .endif .if defined(%POSIX) FC ?= fort77 FFLAGS ?= -O 1 .else FC ?= f77 FFLAGS ?= -O .endif EFLAGS ?= INSTALL ?= install LEX ?= lex LFLAGS ?= LD ?= ld LDFLAGS ?= # LDFLAGS is for CC, _LDFLAGS = ${LDFLAGS:S/-Wl,//g} # strip -Wl, for LD LINT ?= lint LINTFLAGS ?= -cghapbx LINTKERNFLAGS ?= ${LINTFLAGS} LINTOBJFLAGS ?= -cghapbxu -i LINTOBJKERNFLAGS?= ${LINTOBJFLAGS} LINTLIBFLAGS ?= -cghapbxu -C ${LIB} MAKE ?= make .if !defined(%POSIX) LORDER ?= lorder NM ?= nm NMFLAGS ?= OBJC ?= cc OBJCFLAGS ?= ${OBJCINCLUDES} ${CFLAGS} -Wno-import OBJCOPY ?= objcopy PC ?= pc PFLAGS ?= RC ?= f77 RFLAGS ?= TSORT ?= tsort TSORTFLAGS ?= -q .endif SHELL ?= sh .if !defined(%POSIX) SIZE ?= size .endif YACC ?= yacc .if defined(%POSIX) YFLAGS ?= .else YFLAGS ?= -d .endif .if defined(%POSIX) .include "bsd.suffixes-posix.mk" .else # non-Posix rule set .include "bsd.suffixes.mk" # Pull in global settings. __MAKE_CONF?=/etc/make.conf .if exists(${__MAKE_CONF}) .include "${__MAKE_CONF}" .endif # late include for customization .sinclude .if defined(META_MODE) META_MODE:= ${META_MODE:O:u} .endif .if defined(__MAKE_SHELL) && !empty(__MAKE_SHELL) SHELL= ${__MAKE_SHELL} .SHELL: path=${__MAKE_SHELL} .endif # Tell bmake to expand -V VAR by default .MAKE.EXPAND_VARIABLES= yes # Tell bmake the makefile preference .MAKE.MAKEFILE_PREFERENCE= BSDmakefile makefile Makefile # Tell bmake to always pass job tokens, regardless of target depending on # .MAKE or looking like ${MAKE}/${.MAKE}/$(MAKE)/$(.MAKE)/make. .MAKE.ALWAYS_PASS_JOB_QUEUE= yes # By default bmake does *not* use set -e # when running target scripts, this is a problem for many makefiles here. # So define a shell that will do what FreeBSD expects. 
.ifndef WITHOUT_SHELL_ERRCTL __MAKE_SHELL?=/bin/sh .SHELL: name=sh \ quiet="set -" echo="set -v" filter="set -" \ hasErrCtl=yes check="set -e" ignore="set +e" \ echoFlag=v errFlag=e \ path=${__MAKE_SHELL} .endif # Hack for ports compatibility. Historically, ports makefiles have # assumed they can examine MACHINE_CPU without including anything # because this was automatically included in sys.mk. For /usr/src, # this file has moved to being included from bsd.opts.mk. Until all # the ports files are modernized, and a reasonable transition # period has passed, include it while we're in a ports tree here # to preserve historic behavior. .if exists(${.CURDIR}/../../Mk/bsd.port.mk) .include .endif .endif # ! Posix Index: user/alc/PQ_LAUNDRY/sys/amd64/amd64/mem.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/amd64/amd64/mem.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/amd64/amd64/mem.c (revision 308054) @@ -1,238 +1,238 @@ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1982, 1986, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and code derived from software contributed to * Berkeley by William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: Utah $Hdr: mem.c 1.13 89/10/08$ * from: @(#)mem.c 7.2 (Berkeley) 5/9/91 */ #include __FBSDID("$FreeBSD$"); /* * Memory special file */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Used in /dev/mem drivers and elsewhere */ MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors"); /* ARGSUSED */ int memrw(struct cdev *dev, struct uio *uio, int flags) { struct iovec *iov; void *p; ssize_t orig_resid; u_long v, vd; u_int c; int error; error = 0; orig_resid = uio->uio_resid; while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } v = uio->uio_offset; c = ulmin(iov->iov_len, PAGE_SIZE - (u_int)(v & PAGE_MASK)); switch (dev2unit(dev)) { case CDEV_MINOR_KMEM: /* * Since c is clamped to be less or equal than * PAGE_SIZE, the uiomove() call does not * access past the end of the direct map. */ if (v >= DMAP_MIN_ADDRESS && v < DMAP_MIN_ADDRESS + dmaplimit) { error = uiomove((void *)v, c, uio); break; } if (!kernacc((void *)v, c, uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE)) { error = EFAULT; break; } /* * If the extracted address is not accessible * through the direct map, then we make a * private (uncached) mapping because we can't * depend on the existing kernel mapping * remaining valid until the completion of * uiomove(). * * XXX We cannot provide access to the * physical page 0 mapped into KVA. */ v = pmap_extract(kernel_pmap, v); if (v == 0) { error = EFAULT; break; } /* FALLTHROUGH */ case CDEV_MINOR_MEM: if (v < dmaplimit) { vd = PHYS_TO_DMAP(v); error = uiomove((void *)vd, c, uio); break; } - if (v >= (1ULL << cpu_maxphyaddr)) { + if (v > cpu_getmaxphyaddr()) { error = EFAULT; break; } p = pmap_mapdev(v, PAGE_SIZE); error = uiomove(p, c, uio); pmap_unmapdev((vm_offset_t)p, PAGE_SIZE); break; } } /* * Don't return error if any byte was written. Read and write * can return error only if no i/o was performed. */ if (uio->uio_resid != orig_resid) error = 0; return (error); } /* * allow user processes to MMAP some memory sections * instead of going through read/write */ /* ARGSUSED */ int memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int prot __unused, vm_memattr_t *memattr __unused) { if (dev2unit(dev) == CDEV_MINOR_MEM) { - if (offset >= (1ULL << cpu_maxphyaddr)) + if (offset > cpu_getmaxphyaddr()) return (-1); *paddr = offset; return (0); } return (-1); } /* * Operations for changing memory attributes. * * This is basically just an ioctl shim for mem_range_attr_get * and mem_range_attr_set. */ /* ARGSUSED */ int memioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags, struct thread *td) { int nd, error = 0; struct mem_range_op *mo = (struct mem_range_op *)data; struct mem_range_desc *md; /* is this for us? */ if ((cmd != MEMRANGE_GET) && (cmd != MEMRANGE_SET)) return (ENOTTY); /* any chance we can handle this? */ if (mem_range_softc.mr_op == NULL) return (EOPNOTSUPP); /* do we have any descriptors? 
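On the two mem.c hunks above (memrw() and memmmap()): if cpu_getmaxphyaddr() returns the highest valid physical address, i.e. (1 << MAXPHYADDR) - 1, which is what the switch from >= to > implies, then the new comparison accepts and rejects exactly the same offsets as the old shift-based test while avoiding a variable shift at each call site. A standalone sketch of that equivalence; maxphyaddr_bits and demo_getmaxphyaddr() are illustrative stand-ins, not the kernel API.

#include <assert.h>
#include <stdint.h>

static const unsigned maxphyaddr_bits = 36;        /* sample CPUID-reported width */

static uint64_t demo_getmaxphyaddr(void) {
    return ((uint64_t)1 << maxphyaddr_bits) - 1;   /* highest valid address */
}

int main(void) {
    /* Probe a few offsets straddling the boundary. */
    for (uint64_t off = (1ULL << maxphyaddr_bits) - 2;
         off <= (1ULL << maxphyaddr_bits) + 1; off++) {
        int old_check = off >= (1ULL << maxphyaddr_bits);  /* previous test */
        int new_check = off > demo_getmaxphyaddr();        /* revised test */
        assert(old_check == new_check);      /* same accept/reject decision */
    }
    return 0;
}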
*/ if (mem_range_softc.mr_ndesc == 0) return (ENXIO); switch (cmd) { case MEMRANGE_GET: nd = imin(mo->mo_arg[0], mem_range_softc.mr_ndesc); if (nd > 0) { md = (struct mem_range_desc *) malloc(nd * sizeof(struct mem_range_desc), M_MEMDESC, M_WAITOK); error = mem_range_attr_get(md, &nd); if (!error) error = copyout(md, mo->mo_desc, nd * sizeof(struct mem_range_desc)); free(md, M_MEMDESC); } else nd = mem_range_softc.mr_ndesc; mo->mo_arg[0] = nd; break; case MEMRANGE_SET: md = (struct mem_range_desc *)malloc(sizeof(struct mem_range_desc), M_MEMDESC, M_WAITOK); error = copyin(mo->mo_desc, md, sizeof(struct mem_range_desc)); /* clamp description string */ md->mr_owner[sizeof(md->mr_owner) - 1] = 0; if (error == 0) error = mem_range_attr_set(md, &mo->mo_arg[0]); free(md, M_MEMDESC); break; } return (error); } Index: user/alc/PQ_LAUNDRY/sys/amd64/vmm/amd/svm.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/amd64/vmm/amd/svm.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/amd64/vmm/amd/svm.c (revision 308054) @@ -1,2247 +1,2246 @@ /*- * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_ktr.h" #include "vmm_ioport.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); /* * SVM CPUID function 0x8000_000A, edx bit decoding. */ #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. 
*/ #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ VMCB_CACHE_I | \ VMCB_CACHE_TPR | \ VMCB_CACHE_CR2 | \ VMCB_CACHE_CR | \ VMCB_CACHE_DT | \ VMCB_CACHE_SEG | \ VMCB_CACHE_NP) static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 0, NULL); static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); /* Per-CPU context area. */ extern struct pcpu __pcpu[]; static uint32_t svm_feature = ~0U; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, "Number of ASIDs supported by this processor"); /* Current ASID generation for each host cpu */ static struct asid asid[MAXCPU]; /* * SVM host state saved area of size 4KB for each core. */ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); static __inline int flush_by_asid(void) { return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); } static __inline int decode_assist(void) { return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } static void svm_disable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer &= ~EFER_SVM; wrmsr(MSR_EFER, efer); } /* * Disable SVM on all CPUs. */ static int svm_cleanup(void) { smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } /* * Verify that all the features required by bhyve are available. */ static int check_svm_features(void) { u_int regs[4]; /* CPUID Fn8000_000A is for SVM */ do_cpuid(0x8000000A, regs); svm_feature &= regs[3]; /* * The number of ASIDs can be configured to be less than what is * supported by the hardware but not more. */ if (nasid == 0 || nasid > regs[1]) nasid = regs[1]; KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); /* bhyve requires the Nested Paging feature */ if (!(svm_feature & AMD_CPUID_SVM_NP)) { printf("SVM: Nested Paging feature not available.\n"); return (ENXIO); } /* bhyve requires the NRIP Save feature */ if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { printf("SVM: NRIP Save feature not available.\n"); return (ENXIO); } return (0); } static void svm_enable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer |= EFER_SVM; wrmsr(MSR_EFER, efer); wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); } /* * Return 1 if SVM is enabled on this processor and 0 otherwise. */ static int svm_available(void) { uint64_t msr; /* Section 15.4 Enabling SVM from APM2. 
*/ if ((amd_feature2 & AMDID2_SVM) == 0) { printf("SVM: not available.\n"); return (0); } msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) != 0) { printf("SVM: disabled by BIOS.\n"); return (0); } return (1); } static int svm_init(int ipinum) { int error, cpu; if (!svm_available()) return (ENXIO); error = check_svm_features(); if (error) return (error); vmcb_clean &= VMCB_CACHE_DEFAULT; for (cpu = 0; cpu < MAXCPU; cpu++) { /* * Initialize the host ASIDs to their "highest" valid values. * * The next ASID allocation will rollover both 'gen' and 'num' * and start off the sequence at {1,1}. */ asid[cpu].gen = ~0UL; asid[cpu].num = nasid - 1; } svm_msr_init(); svm_npt_init(ipinum); /* Enable SVM on all CPUs */ smp_rendezvous(NULL, svm_enable, NULL, NULL); return (0); } static void svm_restore(void) { svm_enable(NULL); } /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000UL #define MSR_AMD6TH_END 0xC0001FFFUL /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000UL #define MSR_AMD7TH_END 0xC0011FFFUL /* * Get the index and bit position for a MSR in permission bitmap. * Two bits are used for each MSR: lower bit for read and higher bit for write. */ static int svm_msr_index(uint64_t msr, int *index, int *bit) { uint32_t base, off; *index = -1; *bit = (msr % 4) * 2; base = 0; if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { *index = msr / 4; return (0); } base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { off = (msr - MSR_AMD6TH_START); *index = (off + base) / 4; return (0); } base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { off = (msr - MSR_AMD7TH_START); *index = (off + base) / 4; return (0); } return (EINVAL); } /* * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. */ static void svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) { int index, bit, error; error = svm_msr_index(msr, &index, &bit); KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, ("%s: invalid index %d for msr %#lx", __func__, index, msr)); KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " "msr %#lx", __func__, bit, msr)); if (read) perm_bitmap[index] &= ~(1UL << bit); if (write) perm_bitmap[index] &= ~(2UL << bit); } static void svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, true); } static void svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, false); } static __inline int svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) { struct vmcb_ctrl *ctrl; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); return (ctrl->intercept[idx] & bitmask ? 
1 : 0); } static __inline void svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, int enabled) { struct vmcb_ctrl *ctrl; uint32_t oldval; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intercept[idx]; if (enabled) ctrl->intercept[idx] |= bitmask; else ctrl->intercept[idx] &= ~bitmask; if (ctrl->intercept[idx] != oldval) { svm_set_dirty(sc, vcpu, VMCB_CACHE_I); VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); } } static __inline void svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 0); } static __inline void svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 1); } static void vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, uint64_t msrpm_base_pa, uint64_t np_pml4) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; uint32_t mask; int n; ctrl = svm_get_vmcb_ctrl(sc, vcpu); state = svm_get_vmcb_state(sc, vcpu); ctrl->iopm_base_pa = iopm_base_pa; ctrl->msrpm_base_pa = msrpm_base_pa; /* Enable nested paging */ ctrl->np_enable = 1; ctrl->n_cr3 = np_pml4; /* * Intercept accesses to the control registers that are not shadowed * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. */ for (n = 0; n < 16; n++) { mask = (BIT(n) << 16) | BIT(n); if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); else svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. */ if (vcpu_trace_exceptions(sc->vm, vcpu)) { for (n = 0; n < 32; n++) { /* * Skip unimplemented vectors in the exception bitmap. */ if (n == 2 || n == 9) { continue; } svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); } } else { svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); } /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); /* * From section "Canonicalization and Consistency Checks" in APMv2 * the VMRUN intercept bit must be set to pass the consistency check. */ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); /* * The ASID will be set to a non-zero value just before VMRUN. */ ctrl->asid = 0; /* * Section 15.21.1, Interrupt Masking in EFLAGS * Section 15.21.2, Virtualizing APIC.TPR * * This must be set for %rflag and %cr8 isolation of guest and host. 
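Stepping back to svm_msr_index() and svm_msr_perm() above: each MSR occupies two adjacent bits in the permission bitmap, the even bit gating reads and the odd bit gating writes, with the three MSR ranges stacked one after another. A standalone re-derivation of the arithmetic for an MSR in the first (Pentium) range, not the vmm code itself:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t msr = 0x277;                      /* MSR_PAT, inside 0..0x1FFF */
    unsigned byte = (unsigned)(msr / 4);       /* four MSRs per bitmap byte */
    unsigned rd_bit = (unsigned)(msr % 4) * 2; /* even bit: read intercept */
    unsigned wr_bit = rd_bit + 1;              /* odd bit: write intercept */

    /* Clearing a bit (as svm_msr_rw_ok() does) lets the guest access the MSR
     * without a #VMEXIT; leaving it set keeps the intercept armed. */
    printf("MSR %#llx -> byte %u, read bit %u, write bit %u\n",
           (unsigned long long)msr, byte, rd_bit, wr_bit);
    return 0;
}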
*/ ctrl->v_intr_masking = 1; /* Enable Last Branch Record aka LBR for debugging */ ctrl->lbr_virt_en = 1; state->dbgctl = BIT(0); /* EFER_SVM must always be set when the guest is executing */ state->efer = EFER_SVM; /* Set up the PAT to power-on state */ state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); } /* * Initialize a virtual machine. */ static void * svm_vminit(struct vm *vm, pmap_t pmap) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; - vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; + vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; int i; svm_sc = contigmalloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO, - 0, VM_MAX_ADDRESS, PAGE_SIZE, 0); + 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); svm_sc->vm = vm; svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); /* * Intercept read and write accesses to all MSRs. */ memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap)); /* * Access to the following MSRs is redirected to the VMCB when the * guest is executing. Therefore it is safe to allow the guest to * read/write these MSRs directly without hypervisor involvement. */ svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); - + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); /* * Intercept writes to make sure that the EFER_SVM bit is not cleared. */ svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); /* Intercept access to all I/O ports. */ memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap)); iopm_pa = vtophys(svm_sc->iopm_bitmap); msrpm_pa = vtophys(svm_sc->msr_bitmap); pml4_pa = svm_sc->nptp; for (i = 0; i < VM_MAXCPU; i++) { vcpu = svm_get_vcpu(svm_sc, i); vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); svm_msr_guest_init(svm_sc, i); } return (svm_sc); } /* * Collateral for a generic SVM VM-exit. */ static void vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) { vme->exitcode = VM_EXITCODE_SVM; vme->u.svm.exitcode = code; vme->u.svm.exitinfo1 = info1; vme->u.svm.exitinfo2 = info2; } static int svm_cpl(struct vmcb_state *state) { /* * From APMv2: * "Retrieve the CPL from the CPL field in the VMCB, not * from any segment DPL" */ return (state->cpl); } static enum vm_cpu_mode svm_vcpu_mode(struct vmcb *vmcb) { struct vmcb_segment seg; struct vmcb_state *state; int error; state = &vmcb->state; if (state->efer & EFER_LMA) { error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, error)); /* * Section 4.8.1 for APM2, check if Code Segment has * Long attribute set in descriptor. 
*/ if (seg.attrib & VMCB_CS_ATTRIB_L) return (CPU_MODE_64BIT); else return (CPU_MODE_COMPATIBILITY); } else if (state->cr0 & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) { if ((cr0 & CR0_PG) == 0) return (PAGING_MODE_FLAT); if ((cr4 & CR4_PAE) == 0) return (PAGING_MODE_32); if (efer & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } /* * ins/outs utility routines */ static uint64_t svm_inout_str_index(struct svm_regctx *regs, int in) { uint64_t val; val = in ? regs->sctx_rdi : regs->sctx_rsi; return (val); } static uint64_t svm_inout_str_count(struct svm_regctx *regs, int rep) { uint64_t val; val = rep ? regs->sctx_rcx : 1; return (val); } static void svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { /* The segment field has standard encoding */ s = (info1 >> 10) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); } static int svm_inout_str_addrsize(uint64_t info1) { uint32_t size; size = (info1 >> 7) & 0x7; switch (size) { case 1: return (2); /* 16 bit */ case 2: return (4); /* 32 bit */ case 4: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) { struct vmcb_state *state; state = &vmcb->state; paging->cr3 = state->cr3; paging->cpl = svm_cpl(state); paging->cpu_mode = svm_vcpu_mode(vmcb); paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, state->efer); } #define UNHANDLED 0 /* * Handle guest I/O intercept. */ static int svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_regctx *regs; struct vm_inout_str *vis; uint64_t info1; int inout_string; state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); regs = svm_get_guest_regctx(svm_sc, vcpu); info1 = ctrl->exitinfo1; inout_string = info1 & BIT(2) ? 1 : 0; /* * The effective segment number in EXITINFO1[12:10] is populated * only if the processor has the DecodeAssist capability. * * XXX this is not specified explicitly in APMv2 but can be verified * empirically. */ if (inout_string && !decode_assist()) return (UNHANDLED); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; vmexit->u.inout.string = inout_string; vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; vmexit->u.inout.bytes = (info1 >> 4) & 0x7; vmexit->u.inout.port = (uint16_t)(info1 >> 16); vmexit->u.inout.eax = (uint32_t)(state->rax); if (inout_string) { vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); vis->rflags = state->rflags; vis->cr0 = state->cr0; vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); vis->addrsize = svm_inout_str_addrsize(info1); svm_inout_str_seginfo(svm_sc, vcpu, info1, vmexit->u.inout.in, vis); } return (UNHANDLED); } static int npf_fault_type(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_W) return (VM_PROT_WRITE); else if (exitinfo1 & VMCB_NPF_INFO1_ID) return (VM_PROT_EXECUTE); else return (VM_PROT_READ); } static bool svm_npf_emul_fault(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } if (exitinfo1 & VMCB_NPF_INFO1_GPT) { return (false); } if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { return (false); } return (true); } static void svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) { struct vm_guest_paging *paging; struct vmcb_segment seg; struct vmcb_ctrl *ctrl; char *inst_bytes; int error, inst_len; ctrl = &vmcb->ctrl; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = VIE_INVALID_GLA; svm_paging_info(vmcb, paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 1 : 0; break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } /* * Copy the instruction bytes into 'vie' if available. */ if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = ctrl->inst_bytes; } else { inst_len = 0; inst_bytes = NULL; } vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } #ifdef KTR static const char * intrtype_to_str(int intr_type) { switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: return ("hwintr"); case VMCB_EVENTINJ_TYPE_NMI: return ("nmi"); case VMCB_EVENTINJ_TYPE_INTn: return ("swintr"); case VMCB_EVENTINJ_TYPE_EXCEPTION: return ("exception"); default: panic("%s: unknown intr_type %d", __func__, intr_type); } } #endif /* * Inject an event to vcpu as described in section 15.20, "Event injection". 
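 *
 * The 'eventinj' value assembled below packs the vector into the low byte,
 * the event type into bits 10:8, the valid and (optional) error-code-valid
 * flags, and the error code into the upper 32 bits; see the VMCB_EVENTINJ_*
 * definitions for the exact bit positions.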
*/ static void svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, uint32_t error, bool ec_valid) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event already pending %#lx", __func__, ctrl->eventinj)); KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", __func__, vector)); switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: case VMCB_EVENTINJ_TYPE_NMI: case VMCB_EVENTINJ_TYPE_INTn: break; case VMCB_EVENTINJ_TYPE_EXCEPTION: if (vector >= 0 && vector <= 31 && vector != 2) break; /* FALLTHROUGH */ default: panic("%s: invalid intr_type/vector: %d/%d", __func__, intr_type, vector); } ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; if (ec_valid) { ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; ctrl->eventinj |= (uint64_t)error << 32; VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", intrtype_to_str(intr_type), vector, error); } else { VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", intrtype_to_str(intr_type), vector); } } static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { struct vm *vm; struct vlapic *vlapic; struct vmcb_ctrl *ctrl; int pending; vm = sc->vm; vlapic = vm_lapic(vm, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* Update %cr8 in the emulated vlapic */ vlapic_set_cr8(vlapic, ctrl->v_tpr); /* * If V_IRQ indicates that the interrupt injection attempted on then * last VMRUN was successful then update the vlapic accordingly. */ if (ctrl->v_intr_vector != 0) { pending = ctrl->v_irq; KASSERT(ctrl->v_intr_vector >= 16, ("%s: invalid " "v_intr_vector %d", __func__, ctrl->v_intr_vector)); KASSERT(!ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); VCPU_CTR2(vm, vcpu, "v_intr_vector %d %s", ctrl->v_intr_vector, pending ? "pending" : "accepted"); if (!pending) vlapic_intr_accepted(vlapic, ctrl->v_intr_vector); } } static void svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); intinfo = ctrl->exitintinfo; if (!VMCB_EXITINTINFO_VALID(intinfo)) return; /* * From APMv2, Section "Intercepts during IDT interrupt delivery" * * If a #VMEXIT happened during event delivery then record the event * that was being delivered. 
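	 *
	 * For example, if a hardware interrupt being delivered to the guest
	 * takes a nested page fault while the exception frame is being
	 * pushed, the interrupt is latched in EXITINTINFO; recording it here
	 * allows it to be re-injected on a later VM entry instead of being
	 * lost.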
*/ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); } static __inline int vintr_intercept_enabled(struct svm_softc *sc, int vcpu) { return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); } static __inline void enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); KASSERT(vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be enabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq = 1; ctrl->v_ign_tpr = 1; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static __inline void disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(!vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be disabled", __func__)); return; } #ifdef KTR if (ctrl->v_intr_vector == 0) VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); else VCPU_CTR0(sc->vm, vcpu, "Clearing V_IRQ interrupt injection"); #endif ctrl->v_irq = 0; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static int svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) { struct vmcb_ctrl *ctrl; int oldval, newval; ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intr_shadow; newval = val ? 1 : 0; if (newval != oldval) { ctrl->intr_shadow = newval; VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); } return (0); } static int svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); *val = ctrl->intr_shadow; return (0); } /* * Once an NMI is injected it blocks delivery of further NMIs until the handler * executes an IRET. The IRET intercept is enabled when an NMI is injected to * to track when the vcpu is done handling the NMI. */ static int nmi_blocked(struct svm_softc *sc, int vcpu) { int blocked; blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); return (blocked); } static void enable_nmi_blocking(struct svm_softc *sc, int vcpu) { KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); } static void clear_nmi_blocking(struct svm_softc *sc, int vcpu) { int error; KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute * the "iret" when it runs next. However, it is possible to inject * another NMI into the vcpu before the "iret" has actually executed. * * For e.g. if the "iret" encounters a #NPF when accessing the stack * it will trap back into the hypervisor. If an NMI is pending for * the vcpu it will be injected into the guest. * * XXX this needs to be fixed */ svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* * Set 'intr_shadow' to prevent an NMI from being injected on the * immediate VMRUN. 
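	 *
	 * svm_inj_interrupts() treats a non-zero 'intr_shadow' as "cannot
	 * inject an NMI yet"; the shadow is cleared once the guest %rip
	 * advances (see the 'nextrip' check there), after which a pending
	 * NMI can be delivered.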
*/ error = svm_modify_intr_shadow(sc, vcpu, 1); KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); } #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL static int svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) { struct vm_exit *vme; struct vmcb_state *state; uint64_t changed, lma, oldval; int error; state = svm_get_vmcb_state(sc, vcpu); oldval = state->efer; VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ changed = oldval ^ newval; if (newval & EFER_MBZ_BITS) goto gpf; /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ if (changed & EFER_LME) { if (state->cr0 & CR0_PG) goto gpf; } /* EFER.LMA = EFER.LME & CR0.PG */ if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) lma = EFER_LMA; else lma = 0; if ((newval & EFER_LMA) != lma) goto gpf; if (newval & EFER_NXE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) goto gpf; } /* * XXX bhyve does not enforce segment limits in 64-bit mode. Until * this is fixed flag guest attempt to set EFER_LMSLE as an error. */ if (newval & EFER_LMSLE) { vme = vm_exitinfo(sc->vm, vcpu); vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); *retu = true; return (0); } if (newval & EFER_FFXSR) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) goto gpf; } if (newval & EFER_TCE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) goto gpf; } error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); return (0); gpf: vm_inject_gp(sc->vm, vcpu); return (0); } static int emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); else if (num == MSR_EFER) error = svm_write_efer(sc, vcpu, val, retu); else error = svm_wrmsr(sc, vcpu, num, val, retu); return (error); } static int emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) { struct vmcb_state *state; struct svm_regctx *ctx; uint64_t result; int error; if (lapic_msr(num)) error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); else error = svm_rdmsr(sc, vcpu, num, &result, retu); if (error == 0) { state = svm_get_vmcb_state(sc, vcpu); ctx = svm_get_guest_regctx(sc, vcpu); state->rax = result & 0xffffffff; ctx->sctx_rdx = result >> 32; } return (error); } #ifdef KTR static const char * exit_reason_to_str(uint64_t reason) { static char reasonbuf[32]; switch (reason) { case VMCB_EXIT_INVALID: return ("invalvmcb"); case VMCB_EXIT_SHUTDOWN: return ("shutdown"); case VMCB_EXIT_NPF: return ("nptfault"); case VMCB_EXIT_PAUSE: return ("pause"); case VMCB_EXIT_HLT: return ("hlt"); case VMCB_EXIT_CPUID: return ("cpuid"); case VMCB_EXIT_IO: return ("inout"); case VMCB_EXIT_MC: return ("mchk"); case VMCB_EXIT_INTR: return ("extintr"); case VMCB_EXIT_NMI: return ("nmi"); case VMCB_EXIT_VINTR: return ("vintr"); case VMCB_EXIT_MSR: return ("msr"); case VMCB_EXIT_IRET: return ("iret"); case VMCB_EXIT_MONITOR: return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); } } #endif /* KTR */ /* * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs * that are due to instruction intercepts as well as MSR and IOIO intercepts * and exceptions caused by INT3, INTO and BOUND instructions. * * Return 1 if the nRIP is valid and 0 otherwise. */ static int nrip_valid(uint64_t exitcode) { switch (exitcode) { case 0x00 ... 
0x0F: /* read of CR0 through CR15 */ case 0x10 ... 0x1F: /* write of CR0 through CR15 */ case 0x20 ... 0x2F: /* read of DR0 through DR15 */ case 0x30 ... 0x3F: /* write of DR0 through DR15 */ case 0x43: /* INT3 */ case 0x44: /* INTO */ case 0x45: /* BOUND */ case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ return (1); default: return (0); } } static int svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb = svm_get_vmcb(svm_sc, vcpu); state = &vmcb->state; ctrl = &vmcb->ctrl; handled = 0; code = ctrl->exitcode; info1 = ctrl->exitinfo1; info2 = ctrl->exitinfo2; vmexit->exitcode = VM_EXITCODE_BOGUS; vmexit->rip = state->rip; vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); /* * #VMEXIT(INVALID) needs to be handled early because the VMCB is * in an inconsistent state and can trigger assertions that would * never happen otherwise. */ if (code == VMCB_EXIT_INVALID) { vm_exit_svm(vmexit, code, info1, info2); return (0); } KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " "injection valid bit is set %#lx", __func__, ctrl->eventinj)); KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); svm_save_intinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: /* * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); handled = 1; break; case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; case 0x40 ... 0x5F: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); reflect = 1; idtvec = code - 0x40; switch (idtvec) { case IDT_MC: /* * Call the machine check handler by hand. Also don't * reflect the machine check back into the guest. */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, info2); KASSERT(error == 0, ("%s: error %d updating cr2", __func__, error)); /* fallthru */ case IDT_NP: case IDT_SS: case IDT_GP: case IDT_AC: case IDT_TS: errcode_valid = 1; break; case IDT_DF: errcode_valid = 1; info1 = 0; break; case IDT_BP: case IDT_OF: case IDT_BR: /* * The 'nrip' field is populated for INT3, INTO and * BOUND exceptions and this also implies that * 'inst_length' is non-zero. * * Reset 'inst_length' to zero so the guest %rip at * event injection is identical to what it was when * the exception originally happened. 
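			 *
			 * Leaving it non-zero would let the "handled" path at
			 * the bottom of this function advance the guest %rip
			 * past the INT3/INTO/BOUND instruction before the
			 * exception is reflected.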
*/ VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " "to zero before injecting exception %d", vmexit->inst_length, idtvec); vmexit->inst_length = 0; /* fallthru */ default: errcode_valid = 0; info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " "when reflecting exception %d into guest", vmexit->inst_length, idtvec)); if (reflect) { /* Reflect the exception back into the guest */ VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " "%d/%#x into the guest", idtvec, (int)info1); error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } handled = 1; break; case VMCB_EXIT_MSR: /* MSR access. */ eax = state->rax; ecx = ctx->sctx_rcx; edx = ctx->sctx_rdx; retu = false; if (info1) { vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); val = (uint64_t)edx << 32 | eax; VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", ecx, val); if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = val; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } } else { VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } } break; case VMCB_EXIT_IO: handled = svm_handle_io(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); handled = x86_emulate_cpuid(svm_sc->vm, vcpu, (uint32_t *)&state->rax, (uint32_t *)&ctx->sctx_rbx, (uint32_t *)&ctx->sctx_rcx, (uint32_t *)&ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = state->rflags; break; case VMCB_EXIT_PAUSE: vmexit->exitcode = VM_EXITCODE_PAUSE; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); break; case VMCB_EXIT_NPF: /* EXITINFO2 contains the faulting guest physical address */ if (info1 & VMCB_NPF_INFO1_RSV) { VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { svm_handle_inst_emul(vmcb, info2, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } break; case VMCB_EXIT_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case VMCB_EXIT_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? 
"handled" : "unhandled", exit_reason_to_str(code), vmexit->rip, vmexit->inst_length); if (handled) { vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; state->rip = vmexit->rip; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic SVM exit. */ vm_exit_svm(vmexit, code, info1, info2); } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static void svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) { uint64_t intinfo; if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) return; KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " "valid: %#lx", __func__, intinfo)); svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), VMCB_EXITINTINFO_VECTOR(intinfo), VMCB_EXITINTINFO_EC(intinfo), VMCB_EXITINTINFO_EC_VALID(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); } /* * Inject event to virtual cpu. */ static void svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window, pending_apic_vector; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; pending_apic_vector = 0; if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vcpustate->nextrip, state->rip); } /* * Inject pending events or exceptions for this vcpu. * * An event might be pending because the previous #VMEXIT happened * during event delivery (i.e. ctrl->exitintinfo). * * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). */ svm_inj_intinfo(sc, vcpu); /* NMI event has priority over interrupts. */ if (vm_nmi_pending(sc->vm, vcpu)) { if (nmi_blocked(sc, vcpu)) { /* * Can't inject another NMI if the guest has not * yet executed an "iret" after the last NMI. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " "to NMI-blocking"); } else if (ctrl->intr_shadow) { /* * Can't inject an NMI if the vcpu is in an intr_shadow. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " "interrupt shadow"); need_intr_window = 1; goto done; } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { /* * If there is already an exception/interrupt pending * then defer the NMI until after that. */ VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " "eventinj %#lx", ctrl->eventinj); /* * Use self-IPI to trigger a VM-exit as soon as * possible after the event injection is completed. * * This works only if the external interrupt exiting * is at a lower priority than the event injection. * * Although not explicitly specified in APMv2 the * relative priorities were verified empirically. */ ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ } else { vm_nmi_clear(sc->vm, vcpu); /* Inject NMI, vector number is not used */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, IDT_NMI, 0, false); /* virtual NMI blocking is now in effect */ enable_nmi_blocking(sc, vcpu); VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); } } if (!vm_extint_pending(sc->vm, vcpu)) { /* * APIC interrupts are delivered using the V_IRQ offload. 
* * The primary benefit is that the hypervisor doesn't need to * deal with the various conditions that inhibit interrupts. * It also means that TPR changes via CR8 will be handled * without any hypervisor involvement. * * Note that the APIC vector must remain pending in the vIRR * until it is confirmed that it was delivered to the guest. * This can be confirmed based on the value of V_IRQ at the * next #VMEXIT (1 = pending, 0 = delivered). * * Also note that it is possible that another higher priority * vector can become pending before this vector is delivered * to the guest. This is alright because vcpu_notify_event() * will send an IPI and force the vcpu to trap back into the * hypervisor. The higher priority vector will be injected on * the next VMRUN. */ if (vlapic_pending_intr(vlapic, &vector)) { KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); pending_apic_vector = vector; } goto done; } /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); /* * If the guest has disabled interrupts or is in an interrupt shadow * then we cannot inject the pending interrupt. */ if ((state->rflags & PSL_I) == 0) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, state->rflags); need_intr_window = 1; goto done; } if (ctrl->intr_shadow) { VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " "interrupt shadow", vector); need_intr_window = 1; goto done; } if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "eventinj %#lx", vector, ctrl->eventinj); need_intr_window = 1; goto done; } /* * Legacy PIC interrupts are delivered via the event injection * mechanism. */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); vm_extint_clear(sc->vm, vcpu); vatpic_intr_accepted(sc->vm, vector); /* * Force a VM-exit as soon as the vcpu is ready to accept another * interrupt. This is done because the PIC might have another vector * that it wants to inject. Also, if the APIC has a pending interrupt * that was preempted by the ExtInt then it allows us to inject the * APIC vector as soon as possible. */ need_intr_window = 1; done: /* * The guest can modify the TPR by writing to %CR8. In guest mode * the processor reflects this write to V_TPR without hypervisor * intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the * hypervisor. For this reason V_TPR must be updated before every * VMRUN. */ v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } if (pending_apic_vector) { /* * If an APIC vector is being injected then interrupt window * exiting is not possible on this VMRUN. */ KASSERT(!need_intr_window, ("intr_window exiting impossible")); VCPU_CTR1(sc->vm, vcpu, "Injecting vector %d using V_IRQ", pending_apic_vector); ctrl->v_irq = 1; ctrl->v_ign_tpr = 0; ctrl->v_intr_vector = pending_apic_vector; ctrl->v_intr_prio = pending_apic_vector >> 4; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } else if (need_intr_window) { /* * We use V_IRQ in conjunction with the VINTR intercept to * trap into the hypervisor as soon as a virtual interrupt * can be delivered. 
* * Since injected events are not subject to intercept checks * we need to ensure that the V_IRQ is not actually going to * be delivered on VM entry. The KASSERT below enforces this. */ KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, ("Bogus intr_window_exiting: eventinj (%#lx), " "intr_shadow (%u), rflags (%#lx)", ctrl->eventinj, ctrl->intr_shadow, state->rflags)); enable_intr_window_exiting(sc, vcpu); } else { disable_intr_window_exiting(sc, vcpu); } } static __inline void restore_host_tss(void) { struct system_segment_descriptor *tss_sd; /* * The TSS descriptor was in use prior to launching the guest so it * has been marked busy. * * 'ltr' requires the descriptor to be marked available so change the * type to "64-bit available TSS". */ tss_sd = PCPU_GET(tss); tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); } static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { struct svm_vcpu *vcpustate; struct vmcb_ctrl *ctrl; long eptgen; bool alloc_asid; KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " "active on cpu %u", __func__, thiscpu)); vcpustate = svm_get_vcpu(sc, vcpuid); ctrl = svm_get_vmcb_ctrl(sc, vcpuid); /* * The TLB entries associated with the vcpu's ASID are not valid * if either of the following conditions is true: * * 1. The vcpu's ASID generation is different than the host cpu's * ASID generation. This happens when the vcpu migrates to a new * host cpu. It can also happen when the number of vcpus executing * on a host cpu is greater than the number of ASIDs available. * * 2. The pmap generation number is different than the value cached in * the 'vcpustate'. This happens when the host invalidates pages * belonging to the guest. * * asidgen eptgen Action * mismatch mismatch * 0 0 (a) * 0 1 (b1) or (b2) * 1 0 (c) * 1 1 (d) * * (a) There is no mismatch in eptgen or ASID generation and therefore * no further action is needed. * * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is * retained and the TLB entries associated with this ASID * are flushed by VMRUN. * * (b2) If the cpu does not support FlushByAsid then a new ASID is * allocated. * * (c) A new ASID is allocated. * * (d) A new ASID is allocated. */ alloc_asid = false; eptgen = pmap->pm_eptgen; ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; if (vcpustate->asid.gen != asid[thiscpu].gen) { alloc_asid = true; /* (c) and (d) */ } else if (vcpustate->eptgen != eptgen) { if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ else alloc_asid = true; /* (b2) */ } else { /* * This is the common case (a). */ KASSERT(!alloc_asid, ("ASID allocation not necessary")); KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); } if (alloc_asid) { if (++asid[thiscpu].num >= nasid) { asid[thiscpu].num = 1; if (++asid[thiscpu].gen == 0) asid[thiscpu].gen = 1; /* * If this cpu does not support "flush-by-asid" * then flush the entire TLB on a generation * bump. Subsequent ASID allocation in this * generation can be done without a TLB flush. */ if (!flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; } vcpustate->asid.gen = asid[thiscpu].gen; vcpustate->asid.num = asid[thiscpu].num; ctrl->asid = vcpustate->asid.num; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* * If this cpu supports "flush-by-asid" then the TLB * was not flushed after the generation bump. The TLB * is flushed selectively after every new ASID allocation. 
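		 *
		 * For example, with nasid == 4 this allocator hands out ASIDs
		 * 1, 2 and 3; the next allocation wraps back to ASID 1 and
		 * bumps the generation, flushing the whole TLB on cpus without
		 * flush-by-asid and only the guest's entries on cpus with it.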
*/ if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; } vcpustate->eptgen = eptgen; KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } static __inline void disable_gintr(void) { __asm __volatile("clgi"); } static __inline void enable_gintr(void) { __asm __volatile("stgi"); } /* * Start vcpu with specified RIP. */ static int svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { struct svm_regctx *gctx; struct svm_softc *svm_sc; struct svm_vcpu *vcpustate; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; struct vm *vm; uint64_t vmcb_pa; int handled; svm_sc = arg; vm = svm_sc->vm; vcpustate = svm_get_vcpu(svm_sc, vcpu); state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; if (vcpustate->lastcpu != curcpu) { /* * Force new ASID allocation by invalidating the generation. */ vcpustate->asid.gen = 0; /* * Invalidate the VMCB state cache by marking all fields dirty. */ svm_set_dirty(svm_sc, vcpu, 0xffffffff); /* * XXX * Setting 'vcpustate->lastcpu' here is bit premature because * we may return from this function without actually executing * the VMRUN instruction. This could happen if a rendezvous * or an AST is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. */ vcpustate->lastcpu = curcpu; vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); } svm_msr_guest_enter(svm_sc, vcpu); /* Update Guest RIP */ state->rip = rip; do { /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state * loaded by the "vmrun" instruction but also software state * maintained by the hypervisor: suspended and rendezvous * state, NPT generation number, vlapic interrupts etc. */ disable_gintr(); if (vcpu_suspended(evinfo)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_gintr(); vm_exit_rendezvous(vm, vcpu, state->rip); break; } if (vcpu_reqidle(evinfo)) { enable_gintr(); vm_exit_reqidle(vm, vcpu, state->rip); break; } /* We are asked to give the cpu by scheduler. */ if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; } svm_inj_interrupts(svm_sc, vcpu, vlapic); /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); /* * Check the pmap generation and the ASID generation to * ensure that the vcpu does not use stale TLB mappings. */ check_asid(svm_sc, vcpu, pmap, curcpu); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); /* Launch Virtual Machine. */ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); /* * The host GDTR and IDTR is saved by VMRUN and restored * automatically on #VMEXIT. However, the host TSS needs * to be restored explicitly. */ restore_host_tss(); /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); /* Update 'nextrip' */ vcpustate->nextrip = state->rip; /* Handle #VMEXIT and if required return to user space. 
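		 *
		 * svm_vmexit() returns 1 when the exit was handled entirely in
		 * the kernel, in which case the loop re-enters the guest; a
		 * return of 0 breaks out with 'vmexit' populated for further
		 * processing in userland.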
*/ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); svm_msr_guest_exit(svm_sc, vcpu); return (0); } static void svm_vmcleanup(void *arg) { struct svm_softc *sc = arg; contigfree(sc, sizeof (*sc), M_SVM); } static register_t * swctx_regptr(struct svm_regctx *regctx, int reg) { switch (reg) { case VM_REG_GUEST_RBX: return (®ctx->sctx_rbx); case VM_REG_GUEST_RCX: return (®ctx->sctx_rcx); case VM_REG_GUEST_RDX: return (®ctx->sctx_rdx); case VM_REG_GUEST_RDI: return (®ctx->sctx_rdi); case VM_REG_GUEST_RSI: return (®ctx->sctx_rsi); case VM_REG_GUEST_RBP: return (®ctx->sctx_rbp); case VM_REG_GUEST_R8: return (®ctx->sctx_r8); case VM_REG_GUEST_R9: return (®ctx->sctx_r9); case VM_REG_GUEST_R10: return (®ctx->sctx_r10); case VM_REG_GUEST_R11: return (®ctx->sctx_r11); case VM_REG_GUEST_R12: return (®ctx->sctx_r12); case VM_REG_GUEST_R13: return (®ctx->sctx_r13); case VM_REG_GUEST_R14: return (®ctx->sctx_r14); case VM_REG_GUEST_R15: return (®ctx->sctx_r15); default: return (NULL); } } static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_get_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *val = *reg; return (0); } VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); return (EINVAL); } static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_modify_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *reg = val; return (0); } /* * XXX deal with CR3 and invalidate TLB entries tagged with the * vcpu's ASID. This needs to be treated differently depending on * whether 'running' is true/false. 
*/ VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); return (EINVAL); } static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, val); break; case VM_CAP_PAUSE_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; case VM_CAP_UNRESTRICTED_GUEST: /* Unrestricted guest execution cannot be disabled in SVM */ if (val == 0) error = EINVAL; break; default: error = ENOENT; break; } return (error); } static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); break; case VM_CAP_PAUSE_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; default: error = ENOENT; break; } return (error); } static struct vlapic * svm_vlapic_init(void *arg, int vcpuid) { struct svm_softc *svm_sc; struct vlapic *vlapic; svm_sc = arg; vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = svm_sc->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; vlapic_init(vlapic); return (vlapic); } static void svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_SVM_VLAPIC); } struct vmm_ops vmm_ops_amd = { svm_init, svm_cleanup, svm_restore, svm_vminit, svm_vmrun, svm_vmcleanup, svm_getreg, svm_setreg, vmcb_getdesc, vmcb_setdesc, svm_getcap, svm_setcap, svm_npt_alloc, svm_npt_free, svm_vlapic_init, svm_vlapic_cleanup }; Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_cir.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_cir.c (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_cir.c (revision 308054) @@ -0,0 +1,535 @@ +/*- + * Copyright (c) 2016 Ganbold Tsagaankhuu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Allwinner Consumer IR controller + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define READ(_sc, _r) bus_read_4((_sc)->res[0], (_r)) +#define WRITE(_sc, _r, _v) bus_write_4((_sc)->res[0], (_r), (_v)) + +/* IR Control */ +#define AW_IR_CTL 0x00 +/* Global Enable */ +#define AW_IR_CTL_GEN (1 << 0) +/* RX enable */ +#define AW_IR_CTL_RXEN (1 << 1) +/* CIR mode enable */ +#define AW_IR_CTL_MD (1 << 4) | (1 << 5) + +/* RX Config Reg */ +#define AW_IR_RXCTL 0x10 +/* Pulse Polarity Invert flag */ +#define AW_IR_RXCTL_RPPI (1 << 2) + +/* RX Data */ +#define AW_IR_RXFIFO 0x20 + +/* RX Interrupt Control */ +#define AW_IR_RXINT 0x2C +/* RX FIFO Overflow */ +#define AW_IR_RXINT_ROI_EN (1 << 0) +/* RX Packet End */ +#define AW_IR_RXINT_RPEI_EN (1 << 1) +/* RX FIFO Data Available */ +#define AW_IR_RXINT_RAI_EN (1 << 4) +/* RX FIFO available byte level */ +#define AW_IR_RXINT_RAL(val) ((val) << 8) + +/* RX Interrupt Status Reg */ +#define AW_IR_RXSTA 0x30 +/* RX FIFO Get Available Counter */ +#define AW_IR_RXSTA_COUNTER(val) (((val) >> 8) & (sc->fifo_size * 2 - 1)) +/* Clear all interrupt status */ +#define AW_IR_RXSTA_CLEARALL 0xff + +/* IR Sample Configure Reg */ +#define AW_IR_CIR 0x34 +/* Filter Threshold = 8 * 21.3 = ~128us < 200us */ +#define AW_IR_RXFILT_VAL (((8) & 0x3f) << 2) +/* Idle Threshold = (2 + 1) * 128 * 42.7 = ~16.4ms > 9ms */ +#define AW_IR_RXIDLE_VAL (((2) & 0xff) << 8) + +/* Bit 15 - value (pulse/space) */ +#define VAL_MASK 0x80 +/* Bits 0:14 - sample duration */ +#define PERIOD_MASK 0x7f + +/* Clock rate for IR0 or IR1 clock in CIR mode */ +#define AW_IR_BASE_CLK 3000000 +/* Frequency sample 3MHz/64 = 46875Hz (21.3us) */ +#define AW_IR_SAMPLE_64 (0 << 0) +/* Frequency sample 3MHz/128 = 23437.5Hz (42.7us) */ +#define AW_IR_SAMPLE_128 (1 << 0) + +#define AW_IR_ERROR_CODE 0xffffffff +#define AW_IR_REPEAT_CODE 0x0 + +/* 80 * 42.7 = ~3.4ms, Lead1(4.5ms) > AW_IR_L1_MIN */ +#define AW_IR_L1_MIN 80 +/* 40 * 42.7 = ~1.7ms, Lead0(4.5ms) Lead0R(2.25ms) > AW_IR_L0_MIN */ +#define AW_IR_L0_MIN 40 +/* 26 * 42.7 = ~1109us ~= 561 * 2, Pulse < AW_IR_PMAX */ +#define AW_IR_PMAX 26 +/* 26 * 42.7 = ~1109us ~= 561 * 2, D1 > AW_IR_DMID, D0 <= AW_IR_DMID */ +#define AW_IR_DMID 26 +/* 53 * 42.7 = ~2263us ~= 561 * 4, D < AW_IR_DMAX */ +#define AW_IR_DMAX 53 + +/* Active Thresholds */ +#define AW_IR_ACTIVE_T ((0 & 0xff) << 16) +#define AW_IR_ACTIVE_T_C ((1 & 0xff) << 23) + +/* Code masks */ +#define CODE_MASK 0x00ff00ff +#define INV_CODE_MASK 0xff00ff00 +#define VALID_CODE_MASK 0x00ff0000 + +#define A10_IR 1 +#define A13_IR 2 + +#define AW_IR_RAW_BUF_SIZE 128 + +struct aw_ir_softc { + device_t dev; + struct resource *res[2]; + void * intrhand; + int fifo_size; + int dcnt; /* Packet Count */ + unsigned char buf[AW_IR_RAW_BUF_SIZE]; + struct evdev_dev *sc_evdev; +}; + +static struct resource_spec aw_ir_spec[] = { + { SYS_RES_MEMORY, 0, RF_ACTIVE }, + { SYS_RES_IRQ, 0, RF_ACTIVE | RF_SHAREABLE }, + { -1, 0 } +}; + +static struct ofw_compat_data compat_data[] = { + { "allwinner,sun4i-a10-ir", A10_IR }, + { "allwinner,sun5i-a13-ir", A13_IR }, + { NULL, 0 } +}; + +static void +aw_ir_buf_reset(struct aw_ir_softc *sc) +{ + + sc->dcnt = 0; +} + +static void +aw_ir_buf_write(struct aw_ir_softc *sc, unsigned char data) +{ + + if (sc->dcnt < AW_IR_RAW_BUF_SIZE) + sc->buf[sc->dcnt++] = data; + else + if (bootverbose) + device_printf(sc->dev, "IR 
RX Buffer Full!\n"); +} + +static int +aw_ir_buf_full(struct aw_ir_softc *sc) +{ + + return (sc->dcnt >= AW_IR_RAW_BUF_SIZE); +} + +static unsigned char +aw_ir_read_data(struct aw_ir_softc *sc) +{ + + return (unsigned char)(READ(sc, AW_IR_RXFIFO) & 0xff); +} + +static unsigned long +aw_ir_decode_packets(struct aw_ir_softc *sc) +{ + unsigned long len, code; + unsigned char val, last; + unsigned int active_delay; + int i, bitcount; + + if (bootverbose) + device_printf(sc->dev, "sc->dcnt = %d\n", sc->dcnt); + + /* Find Lead 1 (bit separator) */ + active_delay = (AW_IR_ACTIVE_T + 1) * (AW_IR_ACTIVE_T_C ? 128 : 1); + len = 0; + len += (active_delay >> 1); + if (bootverbose) + device_printf(sc->dev, "Initial len: %ld\n", len); + for (i = 0; i < sc->dcnt; i++) { + val = sc->buf[i]; + if (val & VAL_MASK) + len += val & PERIOD_MASK; + else { + if (len > AW_IR_L1_MIN) + break; + len = 0; + } + } + if (bootverbose) + device_printf(sc->dev, "len = %ld\n", len); + if ((val & VAL_MASK) || (len <= AW_IR_L1_MIN)) { + if (bootverbose) + device_printf(sc->dev, "Bit separator error\n"); + goto error_code; + } + + /* Find Lead 0 (bit length) */ + len = 0; + for (; i < sc->dcnt; i++) { + val = sc->buf[i]; + if (val & VAL_MASK) { + if(len > AW_IR_L0_MIN) + break; + len = 0; + } else + len += val & PERIOD_MASK; + } + if ((!(val & VAL_MASK)) || (len <= AW_IR_L0_MIN)) { + if (bootverbose) + device_printf(sc->dev, "Bit length error\n"); + goto error_code; + } + + /* Start decoding */ + code = 0; + bitcount = 0; + last = 1; + len = 0; + for (; i < sc->dcnt; i++) { + val = sc->buf[i]; + if (last) { + if (val & VAL_MASK) + len += val & PERIOD_MASK; + else { + if (len > AW_IR_PMAX) { + if (bootverbose) + device_printf(sc->dev, + "Pulse error\n"); + goto error_code; + } + last = 0; + len = val & PERIOD_MASK; + } + } else { + if (val & VAL_MASK) { + if (len > AW_IR_DMAX) { + if (bootverbose) + device_printf(sc->dev, + "Distant error\n"); + goto error_code; + } else { + if (len > AW_IR_DMID) { + /* Decode */ + code |= 1 << bitcount; + } + bitcount++; + if (bitcount == 32) + break; /* Finish decoding */ + } + last = 1; + len = val & PERIOD_MASK; + } else + len += val & PERIOD_MASK; + } + } + return (code); + +error_code: + + return (AW_IR_ERROR_CODE); +} + +static int +aw_ir_validate_code(unsigned long code) +{ + unsigned long v1, v2; + + /* Don't check address */ + v1 = code & CODE_MASK; + v2 = (code & INV_CODE_MASK) >> 8; + + if (((v1 ^ v2) & VALID_CODE_MASK) == VALID_CODE_MASK) + return (0); /* valid */ + else + return (1); /* invalid */ +} + +static void +aw_ir_intr(void *arg) +{ + struct aw_ir_softc *sc; + uint32_t val; + int i, dcnt; + unsigned long ir_code; + int stat; + + sc = (struct aw_ir_softc *)arg; + + /* Read RX interrupt status */ + val = READ(sc, AW_IR_RXSTA); + + /* Clean all pending interrupt statuses */ + WRITE(sc, AW_IR_RXSTA, val | AW_IR_RXSTA_CLEARALL); + + /* When Rx FIFO Data available or Packet end */ + if (val & (AW_IR_RXINT_RAI_EN | AW_IR_RXINT_RPEI_EN)) { + /* Get available message count in RX FIFO */ + dcnt = AW_IR_RXSTA_COUNTER(val); + /* Read FIFO */ + for (i = 0; i < dcnt; i++) { + if (aw_ir_buf_full(sc)) { + if (bootverbose) + device_printf(sc->dev, + "raw buffer full\n"); + break; + } else + aw_ir_buf_write(sc, aw_ir_read_data(sc)); + } + } + + if (val & AW_IR_RXINT_RPEI_EN) { + /* RX Packet end */ + if (bootverbose) + device_printf(sc->dev, "RX Packet end\n"); + ir_code = aw_ir_decode_packets(sc); + stat = aw_ir_validate_code(ir_code); + if (stat == 0) { + evdev_push_event(sc->sc_evdev, + 
EV_MSC, MSC_SCAN, ir_code); + evdev_sync(sc->sc_evdev); + } + if (bootverbose) { + device_printf(sc->dev, "Final IR code: %lx\n", + ir_code); + device_printf(sc->dev, "IR code status: %d\n", + stat); + } + sc->dcnt = 0; + } + if (val & AW_IR_RXINT_ROI_EN) { + /* RX FIFO overflow */ + if (bootverbose) + device_printf(sc->dev, "RX FIFO overflow\n"); + /* Flush raw buffer */ + aw_ir_buf_reset(sc); + } +} + +static int +aw_ir_probe(device_t dev) +{ + + if (!ofw_bus_status_okay(dev)) + return (ENXIO); + + if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) + return (ENXIO); + + device_set_desc(dev, "Allwinner CIR controller"); + return (BUS_PROBE_DEFAULT); +} + +static int +aw_ir_attach(device_t dev) +{ + struct aw_ir_softc *sc; + hwreset_t rst_apb; + clk_t clk_ir, clk_gate; + int err; + uint32_t val = 0; + + clk_ir = clk_gate = NULL; + + sc = device_get_softc(dev); + sc->dev = dev; + + if (bus_alloc_resources(dev, aw_ir_spec, sc->res) != 0) { + device_printf(dev, "could not allocate memory resource\n"); + return (ENXIO); + } + + switch (ofw_bus_search_compatible(dev, compat_data)->ocd_data) { + case A10_IR: + sc->fifo_size = 16; + break; + case A13_IR: + sc->fifo_size = 64; + break; + } + + /* De-assert reset */ + if (hwreset_get_by_ofw_name(dev, 0, "apb", &rst_apb) == 0) { + err = hwreset_deassert(rst_apb); + if (err != 0) { + device_printf(dev, "cannot de-assert reset\n"); + goto error; + } + } + + /* Reset buffer */ + aw_ir_buf_reset(sc); + + /* Get clocks and enable them */ + err = clk_get_by_ofw_name(dev, 0, "apb", &clk_gate); + if (err != 0) { + device_printf(dev, "Cannot get gate clock\n"); + goto error; + } + err = clk_get_by_ofw_name(dev, 0, "ir", &clk_ir); + if (err != 0) { + device_printf(dev, "Cannot get IR clock\n"); + goto error; + } + /* Set clock rate */ + err = clk_set_freq(clk_ir, AW_IR_BASE_CLK, 0); + if (err != 0) { + device_printf(dev, "cannot set IR clock rate\n"); + goto error; + } + /* Enable clocks */ + err = clk_enable(clk_gate); + if (err != 0) { + device_printf(dev, "Cannot enable clk gate\n"); + goto error; + } + err = clk_enable(clk_ir); + if (err != 0) { + device_printf(dev, "Cannot enable IR clock\n"); + goto error; + } + + if (bus_setup_intr(dev, sc->res[1], + INTR_TYPE_MISC | INTR_MPSAFE, NULL, aw_ir_intr, sc, + &sc->intrhand)) { + bus_release_resources(dev, aw_ir_spec, sc->res); + device_printf(dev, "cannot setup interrupt handler\n"); + return (ENXIO); + } + + /* Enable CIR Mode */ + WRITE(sc, AW_IR_CTL, AW_IR_CTL_MD); + + /* + * Set clock sample, filter, idle thresholds. + * Frequency sample = 3MHz/128 = 23437.5Hz (42.7us) + */ + val = AW_IR_SAMPLE_128; + val |= (AW_IR_RXFILT_VAL | AW_IR_RXIDLE_VAL); + val |= (AW_IR_ACTIVE_T | AW_IR_ACTIVE_T_C); + WRITE(sc, AW_IR_CIR, val); + + /* Invert Input Signal */ + WRITE(sc, AW_IR_RXCTL, AW_IR_RXCTL_RPPI); + + /* Clear All RX Interrupt Status */ + WRITE(sc, AW_IR_RXSTA, AW_IR_RXSTA_CLEARALL); + + /* + * Enable RX interrupt in case of overflow, packet end + * and FIFO available. 
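+	 * The "available" level written to AW_IR_RXINT_RAL below is
+	 * fifo_size/2 - 1, i.e. 7 for the A10's 16-byte FIFO and 31 for
+	 * the A13's 64-byte FIFO, so the interrupt fires once roughly half
+	 * the FIFO has filled.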
+ * RX FIFO Threshold = FIFO size / 2 + */ + WRITE(sc, AW_IR_RXINT, AW_IR_RXINT_ROI_EN | AW_IR_RXINT_RPEI_EN | + AW_IR_RXINT_RAI_EN | AW_IR_RXINT_RAL((sc->fifo_size >> 1) - 1)); + + /* Enable IR Module */ + val = READ(sc, AW_IR_CTL); + WRITE(sc, AW_IR_CTL, val | AW_IR_CTL_GEN | AW_IR_CTL_RXEN); + + sc->sc_evdev = evdev_alloc(); + evdev_set_name(sc->sc_evdev, device_get_desc(sc->dev)); + evdev_set_phys(sc->sc_evdev, device_get_nameunit(sc->dev)); + evdev_set_id(sc->sc_evdev, BUS_HOST, 0, 0, 0); + evdev_support_event(sc->sc_evdev, EV_SYN); + evdev_support_event(sc->sc_evdev, EV_MSC); + evdev_support_msc(sc->sc_evdev, MSC_SCAN); + + err = evdev_register(sc->sc_evdev); + if (err) { + device_printf(dev, + "failed to register evdev: error=%d\n", err); + goto error; + } + + return (0); +error: + if (clk_gate != NULL) + clk_release(clk_gate); + if (clk_ir != NULL) + clk_release(clk_ir); + if (rst_apb != NULL) + hwreset_release(rst_apb); + evdev_free(sc->sc_evdev); + sc->sc_evdev = NULL; /* Avoid double free */ + + bus_release_resources(dev, aw_ir_spec, sc->res); + return (ENXIO); +} + +static device_method_t aw_ir_methods[] = { + DEVMETHOD(device_probe, aw_ir_probe), + DEVMETHOD(device_attach, aw_ir_attach), + + DEVMETHOD_END +}; + +static driver_t aw_ir_driver = { + "aw_ir", + aw_ir_methods, + sizeof(struct aw_ir_softc), +}; +static devclass_t aw_ir_devclass; + +DRIVER_MODULE(aw_ir, simplebus, aw_ir_driver, aw_ir_devclass, 0, 0); +MODULE_DEPEND(aw_ir, evdev, 1, 1, 1); Property changes on: user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_cir.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner (revision 308054) @@ -1,57 +1,58 @@ # $FreeBSD$ kern/kern_clocksource.c standard arm/allwinner/a10_ahci.c optional ahci arm/allwinner/a10_codec.c optional sound arm/allwinner/a10_common.c standard arm/allwinner/a10_dmac.c standard arm/allwinner/a10_ehci.c optional ehci arm/allwinner/aw_usbphy.c optional ehci arm/allwinner/a10_gpio.c optional gpio arm/allwinner/a10_mmc.c optional mmc arm/allwinner/a10_sramc.c standard arm/allwinner/aw_nmi.c optional intrng arm/allwinner/aw_if_dwc.c optional dwc arm/allwinner/aw_rsb.c optional rsb | p2wi arm/allwinner/aw_rtc.c standard arm/allwinner/aw_ts.c standard arm/allwinner/aw_wdog.c standard arm/allwinner/aw_machdep.c standard arm/allwinner/aw_mp.c optional smp arm/allwinner/axp209.c optional axp209 arm/allwinner/axp81x.c optional axp81x arm/allwinner/if_awg.c optional awg arm/allwinner/if_emac.c optional emac arm/allwinner/sunxi_dma_if.m standard dev/iicbus/twsi/a10_twsi.c optional twsi dev/usb/controller/generic_ohci.c optional ohci dev/usb/controller/generic_usb_if.m optional ohci arm/allwinner/aw_sid.c standard arm/allwinner/aw_thermal.c standard dev/iicbus/sy8106a.c optional sy8106a +arm/allwinner/aw_cir.c optional aw_cir evdev #arm/allwinner/console.c standard arm/allwinner/a10_fb.c optional vt arm/allwinner/a10_hdmi.c optional hdmi arm/allwinner/a10_hdmiaudio.c optional hdmi sound arm/arm/hdmi_if.m optional hdmi arm/allwinner/aw_reset.c standard arm/allwinner/aw_ccu.c 
standard arm/allwinner/clk/aw_ahbclk.c standard arm/allwinner/clk/aw_apbclk.c standard arm/allwinner/clk/aw_axiclk.c standard arm/allwinner/clk/aw_codecclk.c standard arm/allwinner/clk/aw_cpuclk.c standard arm/allwinner/clk/aw_cpusclk.c standard arm/allwinner/clk/aw_debeclk.c standard arm/allwinner/clk/aw_gate.c standard arm/allwinner/clk/aw_gmacclk.c standard arm/allwinner/clk/aw_hdmiclk.c standard arm/allwinner/clk/aw_lcdclk.c standard arm/allwinner/clk/aw_modclk.c standard arm/allwinner/clk/aw_mmcclk.c standard arm/allwinner/clk/aw_oscclk.c standard arm/allwinner/clk/aw_pll.c standard arm/allwinner/clk/aw_thsclk.c standard arm/allwinner/clk/aw_usbclk.c standard Index: user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC (revision 308054) @@ -1,178 +1,184 @@ # # GENERICV6 -- Generic(ish) kernel config. # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ ident GENERIC cpu CPU_CORTEXA_MP machine arm armv6 makeoptions CONF_CFLAGS="-march=armv7a" makeoptions KERNVIRTADDR=0xc0000000 options KERNVIRTADDR=0xc0000000 include "std.armv6" files "../allwinner/files.allwinner" files "../allwinner/a20/files.a20" files "../allwinner/a31/files.a31" files "../allwinner/a83t/files.a83t" files "../allwinner/h3/files.h3" files "../broadcom/bcm2835/files.bcm2836" files "../broadcom/bcm2835/files.bcm283x" files "../nvidia/tegra124/files.tegra124" files "../qemu/files.qemu" options SOC_ALLWINNER_A20 options SOC_ALLWINNER_A31 options SOC_ALLWINNER_A31S options SOC_ALLWINNER_A83T options SOC_ALLWINNER_H3 options SOC_BCM2836 options SCHED_ULE # ULE scheduler options SMP # Enable multiple cores options PLATFORM options PLATFORM_SMP options MULTIDELAY options LINUX_BOOT_ABI # EXT_RESOURCES pseudo devices options EXT_RESOURCES device clk device phy device hwreset device regulator # CPU frequency control device cpufreq # Interrupt controller options INTRNG device gic # ARM Generic Timer device generic_timer # MMC/SD/SDIO Card slot support device sdhci # SD controller device mmc # mmc/sd bus device mmcsd # mmc/sd flash cards # ATA controllers device ahci # AHCI-compatible SATA controllers #device ata # Legacy ATA/SATA controllers # PCI options NEW_PCIB device pci # PCI NICs device re # RealTek 8139C+/8169/8169S/8110S # VirtIO device virtio device virtio_mmio device virtio_blk device vtnet # Console and misc device uart device uart_ns8250 device uart_snps device pl011 device pty device snp device md # Memory "disks" device random # Entropy device device psci # I2C support device iicbus device iic device twsi device rsb # Allwinner Reduced Serial Bus device p2wi # Allwinner Push-Pull Two Wire device axp209 # AXP209 Power Management Unit device axp81x # AXP813/818 Power Management Unit device bcm2835_bsc device icee device 
sy8106a # SY8106A Buck Regulator # GPIO device gpio device gpioled device gpioregulator +# EVDEV support +device evdev # input event device support +options EVDEV_SUPPORT # evdev support in legacy drivers +device uinput # install /dev/uinput cdev +device aw_cir + # SPI device spibus device bcm2835_spi device scbus # SCSI bus (required for ATA/SCSI) device da # Direct Access (disks) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) # USB support options USB_HOST_ALIGN=64 # Align usb buffers to cache line size. device usb #device uhci device ohci device ehci device dwcotg # DWC OTG controller device umass # Disks/Mass storage - Requires scbus and da device uhid # "Human Interface Devices" device ukbd # Allow keyboard like HIDs to control console # Ethernet device loop device ether device vlan # 802.1Q VLAN support device mii device bpf #device emac # 10/100 integrated EMAC controller device dwc # 10/100/1000 integrated GMAC controller device awg # 10/100/1000 integrated EMAC controller # USB ethernet support, requires miibus device smcphy device smsc device miibus # Sound support device sound # Framebuffer support device vt device kbdmux device ums device videomode device hdmi device vchiq # Pinmux device fdt_pinctrl # Extensible Firmware Interface options EFI # Flattened Device Tree options FDT # Configure using FDT/DTB data makeoptions MODULES_EXTRA="dtb/allwinner dtb/nvidia dtb/rpi" Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 308054) @@ -1,1059 +1,1066 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright (c) 2012 Martin Matuska */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual device vector for GEOM. */ static g_attrchanged_t vdev_geom_attrchanged; struct g_class zfs_vdev_class = { .name = "ZFS::VDEV", .version = G_VERSION, .attrchanged = vdev_geom_attrchanged, }; DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); SYSCTL_DECL(_vfs_zfs_vdev); /* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); /* Don't send BIO_DELETE. 
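 * Like bio_flush_disable above, this is a read/write tunable, e.g.
 * "sysctl vfs.zfs.vdev.bio_delete_disable=1".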
*/ static int vdev_geom_bio_delete_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); /* Declare local functions */ static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); /* * Thread local storage used to indicate when a thread is probing geoms * for their guids. If NULL, this thread is not tasting geoms. If non NULL, * it is looking for a replacement for the vdev_t* that is its value. */ uint_t zfs_geom_probe_vdev_key; static void vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) { int error; uint16_t rate; error = g_getattr("GEOM::rotation_rate", cp, &rate); if (error == 0) vd->vdev_rotation_rate = rate; else vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN; } static void vdev_geom_set_physpath(struct g_consumer *cp, boolean_t do_null_update) { boolean_t needs_update = B_FALSE; vdev_t *vd; char *physpath; int error, physpath_len; if (g_access(cp, 1, 0, 0) != 0) return; vd = cp->private; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0) { char *old_physpath; /* g_topology lock ensures that vdev has not been closed */ g_topology_assert(); old_physpath = vd->vdev_physpath; vd->vdev_physpath = spa_strdup(physpath); if (old_physpath != NULL) { needs_update = (strcmp(old_physpath, vd->vdev_physpath) != 0); spa_strfree(old_physpath); } else needs_update = do_null_update; } g_free(physpath); /* * If the physical path changed, update the config. * Only request an update for previously unset physpaths if * requested by the caller. */ if (needs_update) spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); } static void vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) { vdev_t *vd; char *old_physpath; int error; vd = cp->private; if (vd == NULL) return; if (strcmp(attr, "GEOM::rotation_rate") == 0) { vdev_geom_set_rotation_rate(vd, cp); return; } if (strcmp(attr, "GEOM::physpath") == 0) { vdev_geom_set_physpath(cp, /*do_null_update*/B_TRUE); return; } } static void vdev_geom_orphan(struct g_consumer *cp) { vdev_t *vd; g_topology_assert(); vd = cp->private; if (vd == NULL) { /* Vdev close in progress. Ignore the event. */ return; } /* * Orphan callbacks occur from the GEOM event thread. * Concurrent with this call, new I/O requests may be * working their way through GEOM about to find out * (only once executed by the g_down thread) that we've * been orphaned from our disk provider. These I/Os * must be retired before we can detach our consumer. * This is most easily achieved by acquiring the * SPA ZIO configuration lock as a writer, but doing * so with the GEOM topology lock held would cause * a lock order reversal. Instead, rely on the SPA's * async removal support to invoke a close on this * vdev once it is safe to do so. */ vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } static struct g_consumer * vdev_geom_attach(struct g_provider *pp, vdev_t *vd) { struct g_geom *gp; struct g_consumer *cp; int error; g_topology_assert(); ZFS_LOG(1, "Attaching to %s.", pp->name); if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n", pp->name, pp->sectorsize); return (NULL); } else if (pp->mediasize < SPA_MINDEVSIZE) { ZFS_LOG(1, "Failing attach of %s. 
Incompatible mediasize %ju\n", pp->name, pp->mediasize); return (NULL); } /* Do we have geom already? No? Create one. */ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; if (strcmp(gp->name, "zfs::vdev") != 0) continue; break; } if (gp == NULL) { gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); gp->orphan = vdev_geom_orphan; gp->attrchanged = vdev_geom_attrchanged; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { - ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, + ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); } else { /* Check if we are already connected to this provider. */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { ZFS_LOG(1, "Found consumer for %s.", pp->name); break; } } if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created consumer for %s.", pp->name); } else { error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); return (NULL); } ZFS_LOG(1, "Used existing consumer for %s.", pp->name); } } /* * BUG: cp may already belong to a vdev. This could happen if: * 1) That vdev is a shared spare, or * 2) We are trying to reopen a missing vdev and we are scanning by * guid. In that case, we'll ultimately fail to open this consumer, * but not until after setting the private field. * The solution is to: * 1) Don't set the private field until after the open succeeds, and * 2) Set it to a linked list of vdevs, not just a single vdev */ cp->private = vd; if (vd != NULL) { vd->vdev_tsd = cp; vdev_geom_set_physpath(cp, /*do_null_update*/B_FALSE); } cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; return (cp); } static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) { struct g_geom *gp; vdev_t *vd; g_topology_assert(); - ZFS_LOG(1, "Detaching consumer. Provider %s.", + ZFS_LOG(1, "Detaching from %s.", cp->provider && cp->provider->name ? cp->provider->name : "NULL"); vd = cp->private; cp->private = NULL; gp = cp->geom; if (open_for_read) g_access(cp, -1, 0, -1); /* Destroy consumer on last close. */ if (cp->acr == 0 && cp->ace == 0) { if (cp->acw > 0) g_access(cp, 0, -cp->acw, 0); if (cp->provider != NULL) { - ZFS_LOG(1, "Destroying consumer to %s.", + ZFS_LOG(1, "Destroying consumer for %s.", cp->provider->name ? cp->provider->name : "NULL"); g_detach(cp); } g_destroy_consumer(cp); } /* Destroy geom if there are no consumers left. 
*/ if (LIST_EMPTY(&gp->consumer)) { ZFS_LOG(1, "Destroyed geom %s.", gp->name); g_wither_geom(gp, ENXIO); } } static void vdev_geom_close_locked(vdev_t *vd) { struct g_consumer *cp; g_topology_assert(); cp = vd->vdev_tsd; vd->vdev_tsd = NULL; vd->vdev_delayed_close = B_FALSE; if (cp == NULL) return; ZFS_LOG(1, "Closing access to %s.", cp->provider->name); vdev_geom_detach(cp, B_TRUE); } -static void -nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid) -{ - - (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid); - (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid); -} - /* * Issue one or more bios to the vdev in parallel * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO * operation is described by parallel entries from each array. There may be * more bios actually issued than entries in the array */ static void vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, off_t *sizes, int *errors, int ncmds) { struct bio **bios; u_char *p; off_t off, maxio, s, end; int i, n_bios, j; size_t bios_size; maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); n_bios = 0; /* How many bios are required for all commands ? */ for (i = 0; i < ncmds; i++) n_bios += (sizes[i] + maxio - 1) / maxio; /* Allocate memory for the bios */ bios_size = n_bios * sizeof(struct bio*); bios = kmem_zalloc(bios_size, KM_SLEEP); /* Prepare and issue all of the bios */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; p = datas[i]; s = sizes[i]; end = off + s; ASSERT((off % cp->provider->sectorsize) == 0); ASSERT((s % cp->provider->sectorsize) == 0); for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { bios[j] = g_alloc_bio(); bios[j]->bio_cmd = cmds[i]; bios[j]->bio_done = NULL; bios[j]->bio_offset = off; bios[j]->bio_length = MIN(s, maxio); bios[j]->bio_data = p; g_io_request(bios[j], cp); } } ASSERT(j == n_bios); /* Wait for all of the bios to complete, and clean them up */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; s = sizes[i]; end = off + s; for (; off < end; off += maxio, s -= maxio, j++) { errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; g_destroy_bio(bios[j]); } } kmem_free(bios, bios_size); } static int vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) { struct g_provider *pp; vdev_phys_t *vdev_lists[VDEV_LABELS]; char *p, *buf; size_t buflen; uint64_t psize, state, txg; off_t offsets[VDEV_LABELS]; off_t size; off_t sizes[VDEV_LABELS]; int cmds[VDEV_LABELS]; int errors[VDEV_LABELS]; int l, len; g_topology_assert_not(); pp = cp->provider; ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); size = sizeof(*vdev_lists[0]) + pp->sectorsize - ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1; buflen = sizeof(vdev_lists[0]->vp_nvlist); *config = NULL; /* Create all of the IO requests */ for (l = 0; l < VDEV_LABELS; l++) { cmds[l] = BIO_READ; vdev_lists[l] = kmem_alloc(size, KM_SLEEP); offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; sizes[l] = size; errors[l] = 0; ASSERT(offsets[l] % pp->sectorsize == 0); } /* Issue the IO requests */ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, VDEV_LABELS); /* Parse the labels */ for (l = 0; l < VDEV_LABELS; l++) { if (errors[l] != 0) continue; buf = vdev_lists[l]->vp_nvlist; if (nvlist_unpack(buf, buflen, config, 0) != 0) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { 
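A minimal standalone sketch (illustrative only; bios_needed and the sample sizes below are hypothetical, not part of this change) of the arithmetic vdev_geom_io() uses above: maxio is MAXPHYS rounded down to a multiple of the provider's sector size, and each command of size bytes contributes ceil(size / maxio) entries to the bio array.

#include <stdio.h>

/* Number of bios needed for one command, mirroring vdev_geom_io(). */
static long
bios_needed(long size, long maxphys, long sectorsize)
{
	long maxio = maxphys - (maxphys % sectorsize);

	return ((size + maxio - 1) / maxio);	/* ceiling division */
}

int
main(void)
{
	/* e.g. a 256 KiB request with a 128 KiB MAXPHYS and 512-byte sectors */
	printf("%ld bios\n", bios_needed(256 * 1024, 128 * 1024, 512));	/* 2 */
	return (0);
}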
nvlist_free(*config); *config = NULL; continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(*config); *config = NULL; continue; } break; } /* Free the label storage */ for (l = 0; l < VDEV_LABELS; l++) kmem_free(vdev_lists[l], size); return (*config == NULL ? ENOENT : 0); } static void resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) { nvlist_t **new_configs; uint64_t i; if (id < *count) return; new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), KM_SLEEP); for (i = 0; i < *count; i++) new_configs[i] = (*configs)[i]; if (*configs != NULL) kmem_free(*configs, *count * sizeof(void *)); *configs = new_configs; *count = id + 1; } static void process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, const char *name, uint64_t* known_pool_guid) { nvlist_t *vdev_tree; uint64_t pool_guid; uint64_t vdev_guid, known_guid; uint64_t id, txg, known_txg; char *pname; int i; if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) goto ignore; if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) goto ignore; if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) goto ignore; VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (*known_pool_guid != 0) { if (pool_guid != *known_pool_guid) goto ignore; } else *known_pool_guid = pool_guid; resize_configs(configs, count, id); if ((*configs)[id] != NULL) { VERIFY(nvlist_lookup_uint64((*configs)[id], ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); if (txg <= known_txg) goto ignore; nvlist_free((*configs)[id]); } (*configs)[id] = cfg; return; ignore: nvlist_free(cfg); } int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *zcp; nvlist_t *vdev_cfg; uint64_t pool_guid; int error; DROP_GIANT(); g_topology_lock(); *configs = NULL; *count = 0; pool_guid = 0; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->flags & G_PF_WITHER) continue; zcp = vdev_geom_attach(pp, NULL); if (zcp == NULL) continue; g_topology_unlock(); error = vdev_geom_read_config(zcp, &vdev_cfg); g_topology_lock(); vdev_geom_detach(zcp, B_TRUE); if (error) continue; ZFS_LOG(1, "successfully read vdev config"); process_vdev_config(configs, count, vdev_cfg, name, &pool_guid); } } } g_topology_unlock(); PICKUP_GIANT(); return (*count > 0 ? 
0 : ENOENT); } -static void -vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid) -{ - nvlist_t *config; +enum match { + NO_MATCH, + TOP_MATCH, + FULL_MATCH +}; - g_topology_assert_not(); - - *pguid = 0; - *vguid = 0; - if (vdev_geom_read_config(cp, &config) == 0) { - nvlist_get_guids(config, pguid, vguid); - nvlist_free(config); - } -} - -static boolean_t +static enum match vdev_attach_ok(vdev_t *vd, struct g_provider *pp) { - uint64_t pool_guid; - uint64_t vdev_guid; - struct g_consumer *zcp; - boolean_t pool_ok; - boolean_t vdev_ok; + nvlist_t *config; + uint64_t pool_guid, top_guid, vdev_guid; + struct g_consumer *cp; - zcp = vdev_geom_attach(pp, NULL); - if (zcp == NULL) { + cp = vdev_geom_attach(pp, NULL); + if (cp == NULL) { ZFS_LOG(1, "Unable to attach tasting instance to %s.", pp->name); - return (B_FALSE); + return (NO_MATCH); } g_topology_unlock(); - vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid); + if (vdev_geom_read_config(cp, &config) != 0) { + g_topology_lock(); + vdev_geom_detach(cp, B_TRUE); + ZFS_LOG(1, "Unable to read config from %s.", pp->name); + return (NO_MATCH); + } g_topology_lock(); - vdev_geom_detach(zcp, B_TRUE); + vdev_geom_detach(cp, B_TRUE); - /* - * Check that the label's vdev guid matches the desired guid. If the - * label has a pool guid, check that it matches too. (Inactive spares - * and L2ARCs do not have any pool guid in the label.) + pool_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); + top_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); + vdev_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + nvlist_free(config); + + /* + * Check that the label's pool guid matches the desired guid. + * Inactive spares and L2ARCs do not have any pool guid in the label. */ - if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) && - vdev_guid == vd->vdev_guid) { - ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path); - return (B_TRUE); - } else { - ZFS_LOG(1, "guid mismatch for provider %s: " - "%ju:%ju != %ju:%ju.", vd->vdev_path, - (uintmax_t)spa_guid(vd->vdev_spa), - (uintmax_t)vd->vdev_guid, - (uintmax_t)pool_guid, (uintmax_t)vdev_guid); - return (B_FALSE); + if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { + ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", + pp->name, + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); + return (NO_MATCH); } + + /* + * Check that the label's vdev guid matches the desired guid. + * The second condition handles possible race on vdev detach, when + * remaining vdev receives GUID of destroyed top level mirror vdev. 
+ */ + if (vdev_guid == vd->vdev_guid) { + ZFS_LOG(1, "guids match for provider %s.", pp->name); + return (FULL_MATCH); + } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { + ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); + return (TOP_MATCH); + } + ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", + pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); + return (NO_MATCH); } static struct g_consumer * vdev_geom_attach_by_guids(vdev_t *vd) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *cp; + enum match m; g_topology_assert(); cp = NULL; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { - if (!vdev_attach_ok(vd, pp)) + m = vdev_attach_ok(vd, pp); + if (m == NO_MATCH) continue; + if (cp != NULL) { + if (m == FULL_MATCH) + vdev_geom_detach(cp, B_TRUE); + else + continue; + } cp = vdev_geom_attach(pp, vd); if (cp == NULL) { printf("ZFS WARNING: Unable to " "attach to %s.\n", pp->name); continue; } - break; + if (m == FULL_MATCH) + return (cp); } - if (cp != NULL) - break; } - if (cp != NULL) - break; } -end: return (cp); } static struct g_consumer * vdev_geom_open_by_guids(vdev_t *vd) { struct g_consumer *cp; char *buf; size_t len; g_topology_assert(); ZFS_LOG(1, "Searching by guids [%ju:%ju].", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); cp = vdev_geom_attach_by_guids(vd); if (cp != NULL) { len = strlen(cp->provider->name) + strlen("/dev/") + 1; buf = kmem_alloc(len, KM_SLEEP); snprintf(buf, len, "/dev/%s", cp->provider->name); spa_strfree(vd->vdev_path); vd->vdev_path = buf; ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, vd->vdev_path); } else { ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); } return (cp); } static struct g_consumer * vdev_geom_open_by_path(vdev_t *vd, int check_guid) { struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); cp = NULL; pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); - if (!check_guid || vdev_attach_ok(vd, pp)) + if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) cp = vdev_geom_attach(pp, vd); } return (cp); } static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct g_provider *pp; struct g_consumer *cp; size_t bufsize; int error; /* Set the TLS to indicate downstack that we should not access zvols*/ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } vd->vdev_tsd = NULL; DROP_GIANT(); g_topology_lock(); error = 0; if (vd->vdev_spa->spa_splitting_newspa || (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) { /* * We are dealing with a vdev that hasn't been previously * opened (since boot), and we are not loading an * existing pool configuration. This looks like a * vdev add operation to a new or existing pool. * Assume the user knows what he/she is doing and find * GEOM provider by its name, ignoring GUID mismatches. 
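A small self-contained sketch (illustrative; pick_match and main are hypothetical) of the precedence implemented by vdev_attach_ok() and vdev_geom_attach_by_guids() above: a FULL_MATCH on the leaf vdev guid ends the scan immediately, a TOP_MATCH is only remembered as a fallback, and NO_MATCH providers are skipped.

#include <stdio.h>

enum match { NO_MATCH, TOP_MATCH, FULL_MATCH };

/* Return the index of the provider to attach, or -1 if none matched. */
static int
pick_match(const enum match m[], int n)
{
	int best = -1;
	int i;

	for (i = 0; i < n; i++) {
		if (m[i] == FULL_MATCH)
			return (i);	/* exact vdev guid: stop scanning */
		if (m[i] == TOP_MATCH && best == -1)
			best = i;	/* keep first top-level match as fallback */
	}
	return (best);
}

int
main(void)
{
	enum match probe[] = { NO_MATCH, TOP_MATCH, FULL_MATCH, TOP_MATCH };

	printf("%d\n", pick_match(probe, 4));	/* prints 2 */
	return (0);
}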
* * XXPOLICY: It would be safer to only allow a device * that is unlabeled or labeled but missing * GUID information to be opened in this fashion, * unless we are doing a split, in which case we * should allow any guid. */ cp = vdev_geom_open_by_path(vd, 0); } else { /* * Try using the recorded path for this device, but only * accept it if its label data contains the expected GUIDs. */ cp = vdev_geom_open_by_path(vd, 1); if (cp == NULL) { /* * The device at vd->vdev_path doesn't have the * expected GUIDs. The disks might have merely * moved around so try all other GEOM providers * to find one with the right GUIDs. */ cp = vdev_geom_open_by_guids(vd); } } /* Clear the TLS now that tasting is done */ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); if (cp == NULL) { ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); error = ENOENT; } else if (cp->provider->sectorsize > VDEV_PAD_SIZE || !ISP2(cp->provider->sectorsize)) { ZFS_LOG(1, "Provider %s has unsupported sectorsize.", vd->vdev_path); vdev_geom_close_locked(vd); error = EINVAL; cp = NULL; } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) { int i; for (i = 0; i < 5; i++) { error = g_access(cp, 0, 1, 0); if (error == 0) break; g_topology_unlock(); tsleep(vd, 0, "vdev", hz / 2); g_topology_lock(); } if (error != 0) { printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", vd->vdev_path, error); vdev_geom_close_locked(vd); cp = NULL; } } /* Fetch initial physical path information for this device. */ if (cp != NULL) vdev_geom_attrchanged(cp, "GEOM::physpath"); g_topology_unlock(); PICKUP_GIANT(); if (cp == NULL) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } pp = cp->provider; /* * Determine the actual size of the device. */ *max_psize = *psize = pp->mediasize; /* * Determine the device's minimum transfer size and preferred * transfer size. */ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* * Clear the nowritecache settings, so that on a vdev_reopen() * we will try again. */ vd->vdev_nowritecache = B_FALSE; /* * Determine the device's rotation rate. */ vdev_geom_set_rotation_rate(vd, cp); return (0); } static void vdev_geom_close(vdev_t *vd) { DROP_GIANT(); g_topology_lock(); vdev_geom_close_locked(vd); g_topology_unlock(); PICKUP_GIANT(); } static void vdev_geom_io_intr(struct bio *bp) { vdev_t *vd; zio_t *zio; zio = bp->bio_caller1; vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) zio->io_error = SET_ERROR(EIO); switch(zio->io_error) { case ENOTSUP: /* * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know * that future attempts will never succeed. In this case * we set a persistent flag so that we don't bother with * requests in the future. */ switch(bp->bio_cmd) { case BIO_FLUSH: vd->vdev_nowritecache = B_TRUE; break; case BIO_DELETE: vd->vdev_notrim = B_TRUE; break; } break; case ENXIO: if (!vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being * removed. 
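A short sketch (illustrative; ashift_for is hypothetical) of the logical ashift computation in vdev_geom_open() above: highbit(x) - 1 is simply log2 for a power-of-two sector size, so 512-byte sectors give ashift 9 and 4096-byte sectors give ashift 12; the same idea is reused for physical_ashift when the stripe size is a larger power of two.

#include <stdint.h>
#include <stdio.h>

/* Equivalent of highbit(sectorsize) - 1 for a power-of-two sector size. */
static int
ashift_for(uint64_t sectorsize)
{
	int bit = 0;

	while ((sectorsize >> (bit + 1)) != 0)
		bit++;
	return (bit);
}

int
main(void)
{
	printf("%d %d\n", ashift_for(512), ashift_for(4096));	/* 9 12 */
	return (0);
}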
*/ if (bp->bio_to->error != 0) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } break; } g_destroy_bio(bp); zio_delay_interrupt(zio); } static void vdev_geom_io_start(zio_t *zio) { vdev_t *vd; struct g_consumer *cp; struct bio *bp; int error; vd = zio->io_vd; switch (zio->io_type) { case ZIO_TYPE_IOCTL: /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } else { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } goto sendreq; default: zio->io_error = SET_ERROR(ENOTSUP); } } zio_execute(zio); return; case ZIO_TYPE_FREE: if (vd->vdev_notrim) { zio->io_error = SET_ERROR(ENOTSUP); } else if (!vdev_geom_bio_delete_disable) { goto sendreq; } zio_execute(zio); return; } sendreq: ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE || zio->io_type == ZIO_TYPE_IOCTL); cp = vd->vdev_tsd; if (cp == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE; bp->bio_data = zio->io_data; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_FREE: bp->bio_cmd = BIO_DELETE; bp->bio_data = NULL; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; break; } bp->bio_done = vdev_geom_io_intr; g_io_request(bp, cp); } static void vdev_geom_io_done(zio_t *zio) { } static void vdev_geom_hold(vdev_t *vd) { } static void vdev_geom_rele(vdev_t *vd) { } vdev_ops_t vdev_geom_ops = { vdev_geom_open, vdev_geom_close, vdev_default_asize, vdev_geom_io_start, vdev_geom_io_done, NULL, vdev_geom_hold, vdev_geom_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c (revision 308054) @@ -1,191 +1,187 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
* Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright 2016 The MathWorks, Inc. All rights reserved. */ /* * A Zero Reference Lock (ZRL) is a reference count that can lock out new * references only when the count is zero and only without waiting if the count * is not already zero. It is similar to a read-write lock in that it allows * multiple readers and only a single writer, but it does not allow a writer to * block while waiting for readers to exit, and therefore the question of * reader/writer priority is moot (no WRWANT bit). Since the equivalent of * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it * is perfectly safe for the same reader to acquire the same lock multiple * times. The fact that a ZRL is reentrant for readers (through multiple calls * to zrl_add()) makes it convenient for determining whether something is * actively referenced without the fuss of flagging lock ownership across * function calls. */ #include /* * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is * treated as zero references. */ #define ZRL_LOCKED -1 #define ZRL_DESTROYED -2 void zrl_init(zrlock_t *zrl) { mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); zrl->zr_refcount = 0; cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); #ifdef ZFS_DEBUG zrl->zr_owner = NULL; zrl->zr_caller = NULL; #endif } void zrl_destroy(zrlock_t *zrl) { ASSERT0(zrl->zr_refcount); mutex_destroy(&zrl->zr_mtx); zrl->zr_refcount = ZRL_DESTROYED; cv_destroy(&zrl->zr_cv); } void zrl_add_impl(zrlock_t *zrl, const char *zc) { - uint32_t n = (uint32_t)zrl->zr_refcount; - - while (n != ZRL_LOCKED) { - uint32_t cas = atomic_cas_32( - (uint32_t *)&zrl->zr_refcount, n, n + 1); - if (cas == n) { - ASSERT3S((int32_t)n, >=, 0); + for (;;) { + uint32_t n = (uint32_t)zrl->zr_refcount; + while (n != ZRL_LOCKED) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, n, n + 1); + if (cas == n) { + ASSERT3S((int32_t)n, >=, 0); #ifdef ZFS_DEBUG - if (zrl->zr_owner == curthread) { - DTRACE_PROBE2(zrlock__reentry, - zrlock_t *, zrl, uint32_t, n); - } - zrl->zr_owner = curthread; - zrl->zr_caller = zc; + if (zrl->zr_owner == curthread) { + DTRACE_PROBE2(zrlock__reentry, + zrlock_t *, zrl, uint32_t, n); + } + zrl->zr_owner = curthread; + zrl->zr_caller = zc; #endif - return; + return; + } + n = cas; } - n = cas; - } - mutex_enter(&zrl->zr_mtx); - while (zrl->zr_refcount == ZRL_LOCKED) { - cv_wait(&zrl->zr_cv, &zrl->zr_mtx); + mutex_enter(&zrl->zr_mtx); + while (zrl->zr_refcount == ZRL_LOCKED) { + cv_wait(&zrl->zr_cv, &zrl->zr_mtx); + } + mutex_exit(&zrl->zr_mtx); } - ASSERT3S(zrl->zr_refcount, >=, 0); - zrl->zr_refcount++; -#ifdef ZFS_DEBUG - zrl->zr_owner = curthread; - zrl->zr_caller = zc; -#endif - mutex_exit(&zrl->zr_mtx); } void zrl_remove(zrlock_t *zrl) { uint32_t n; #ifdef ZFS_DEBUG if (zrl->zr_owner == curthread) { zrl->zr_owner = NULL; zrl->zr_caller = NULL; } #endif n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); ASSERT3S((int32_t)n, >=, 0); } int zrl_tryenter(zrlock_t *zrl) { uint32_t n = (uint32_t)zrl->zr_refcount; if (n == 0) { uint32_t cas = atomic_cas_32( (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); if (cas == 0) { #ifdef ZFS_DEBUG ASSERT3P(zrl->zr_owner, ==, NULL); zrl->zr_owner = curthread; #endif return (1); } } ASSERT3S((int32_t)n, >, ZRL_DESTROYED); return (0); } void zrl_exit(zrlock_t *zrl) { ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED); mutex_enter(&zrl->zr_mtx); #ifdef ZFS_DEBUG ASSERT3P(zrl->zr_owner, ==, curthread); zrl->zr_owner = NULL; 
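A minimal C11 sketch (illustrative; zrl_try_add is hypothetical) of the lock-free step that zrl_add_impl() above performs with atomic_cas_32(): the reference count is incremented only while it is non-negative, and ZRL_LOCKED (-1, stored as all-ones) makes the compare-and-swap fail so the caller falls back to the mutex/condvar wait and then retries the whole loop.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define	ZRL_LOCKED	((uint32_t)-1)

/* Take a reference unless the ZRL is currently locked. */
static bool
zrl_try_add(_Atomic uint32_t *refcount)
{
	uint32_t n = atomic_load(refcount);

	while (n != ZRL_LOCKED) {
		/* On failure, n is reloaded with the current count. */
		if (atomic_compare_exchange_weak(refcount, &n, n + 1))
			return (true);	/* reference taken */
	}
	return (false);	/* locked: wait on the condvar, then retry */
}

int
main(void)
{
	_Atomic uint32_t rc = 0;

	return (zrl_try_add(&rc) ? 0 : 1);	/* takes the first reference */
}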
membar_producer(); /* make sure the owner store happens first */ #endif zrl->zr_refcount = 0; cv_broadcast(&zrl->zr_cv); mutex_exit(&zrl->zr_mtx); } int zrl_refcount(zrlock_t *zrl) { ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); int n = (int)zrl->zr_refcount; return (n <= 0 ? 0 : n); } int zrl_is_zero(zrlock_t *zrl) { ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); return (zrl->zr_refcount <= 0); } int zrl_is_locked(zrlock_t *zrl) { ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); return (zrl->zr_refcount == ZRL_LOCKED); } #ifdef ZFS_DEBUG kthread_t * zrl_owner(zrlock_t *zrl) { return (zrl->zr_owner); } #endif Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris (revision 308054) Property changes on: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,2 ## Merged /head/sys/cddl/contrib/opensolaris:r307868-308053 Merged /vendor-sys/illumos/dist:r307993 Index: user/alc/PQ_LAUNDRY/sys/conf/files.amd64 =================================================================== --- user/alc/PQ_LAUNDRY/sys/conf/files.amd64 (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/conf/files.amd64 (revision 308054) @@ -1,670 +1,670 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. # # cloudabi32_vdso.o optional compat_cloudabi32 \ dependency "$S/contrib/cloudabi/cloudabi_vdso_i686_on_64bit.S" \ compile-with "${CC} -x assembler-with-cpp -m32 -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_i686_on_64bit.S -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "cloudabi32_vdso.o" # cloudabi32_vdso_blob.o optional compat_cloudabi32 \ dependency "cloudabi32_vdso.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf64-x86-64-freebsd --binary-architecture i386 cloudabi32_vdso.o ${.TARGET}" \ no-implicit-rule \ clean "cloudabi32_vdso_blob.o" # cloudabi64_vdso.o optional compat_cloudabi64 \ dependency "$S/contrib/cloudabi/cloudabi_vdso_x86_64.S" \ compile-with "${CC} -x assembler-with-cpp -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_x86_64.S -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "cloudabi64_vdso.o" # cloudabi64_vdso_blob.o optional compat_cloudabi64 \ dependency "cloudabi64_vdso.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf64-x86-64-freebsd --binary-architecture i386 cloudabi64_vdso.o ${.TARGET}" \ no-implicit-rule \ clean "cloudabi64_vdso_blob.o" # linux32_genassym.o optional compat_linux32 \ dependency "$S/amd64/linux32/linux32_genassym.c" \ compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ no-obj no-implicit-rule \ clean "linux32_genassym.o" # linux32_assym.h optional compat_linux32 \ dependency "$S/kern/genassym.sh linux32_genassym.o" \ compile-with "sh $S/kern/genassym.sh linux32_genassym.o > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "linux32_assym.h" # linux32_locore.o optional compat_linux32 \ dependency "linux32_assym.h $S/amd64/linux32/linux32_locore.s" \ 
compile-with "${CC} -x assembler-with-cpp -DLOCORE -m32 -shared -s -pipe -I. -I$S -Werror -Wall -fno-common -nostdinc -nostdlib -Wl,-T$S/amd64/linux32/linux32_vdso.lds.s -Wl,-soname=linux32_vdso.so,--eh-frame-hdr,-fPIC,-warn-common ${.IMPSRC} -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "linux32_locore.o" # linux32_vdso.so optional compat_linux32 \ dependency "linux32_locore.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf64-x86-64-freebsd --binary-architecture i386 linux32_locore.o ${.TARGET}" \ no-implicit-rule \ clean "linux32_vdso.so" # ia32_genassym.o standard \ dependency "$S/compat/ia32/ia32_genassym.c" \ compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ no-obj no-implicit-rule \ clean "ia32_genassym.o" # ia32_assym.h standard \ dependency "$S/kern/genassym.sh ia32_genassym.o" \ compile-with "env NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh ia32_genassym.o > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "ia32_assym.h" # font.h optional sc_dflt_font \ compile-with "uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x16.fnt && file2c 'static u_char dflt_font_16[16*256] = {' '};' < ${SC_DFLT_FONT}-8x16 > font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x14.fnt && file2c 'static u_char dflt_font_14[14*256] = {' '};' < ${SC_DFLT_FONT}-8x14 >> font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x8.fnt && file2c 'static u_char dflt_font_8[8*256] = {' '};' < ${SC_DFLT_FONT}-8x8 >> font.h" \ no-obj no-implicit-rule before-depend \ clean "font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8" # atkbdmap.h optional atkbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${ATKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > atkbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "atkbdmap.h" # ukbdmap.h optional ukbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${UKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > ukbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "ukbdmap.h" # hpt27xx_lib.o optional hpt27xx \ dependency "$S/dev/hpt27xx/amd64-elf.hpt27xx_lib.o.uu" \ compile-with "uudecode < $S/dev/hpt27xx/amd64-elf.hpt27xx_lib.o.uu" \ no-implicit-rule # hptmvraid.o optional hptmv \ dependency "$S/dev/hptmv/amd64-elf.raid.o.uu" \ compile-with "uudecode < $S/dev/hptmv/amd64-elf.raid.o.uu" \ no-implicit-rule # hptnr_lib.o optional hptnr \ dependency "$S/dev/hptnr/amd64-elf.hptnr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptnr/amd64-elf.hptnr_lib.o.uu" \ no-implicit-rule # hptrr_lib.o optional hptrr \ dependency "$S/dev/hptrr/amd64-elf.hptrr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptrr/amd64-elf.hptrr_lib.o.uu" \ no-implicit-rule # amd64/acpica/acpi_machdep.c optional acpi acpi_wakecode.o optional acpi \ dependency "$S/amd64/acpica/acpi_wakecode.S assym.s" \ compile-with "${NORMAL_S}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.o" acpi_wakecode.bin optional acpi \ dependency "acpi_wakecode.o" \ compile-with "${OBJCOPY} -S -O binary acpi_wakecode.o ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.bin" acpi_wakecode.h optional acpi \ dependency "acpi_wakecode.bin" \ compile-with "file2c -sx 'static char wakecode[] = 
{' '};' < acpi_wakecode.bin > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.h" acpi_wakedata.h optional acpi \ dependency "acpi_wakecode.o" \ compile-with '${NM} -n --defined-only acpi_wakecode.o | while read offset dummy what; do echo "#define $${what} 0x$${offset}"; done > ${.TARGET}' \ no-obj no-implicit-rule before-depend \ clean "acpi_wakedata.h" # amd64/amd64/amd64_mem.c optional mem #amd64/amd64/apic_vector.S standard amd64/amd64/atomic.c standard amd64/amd64/bios.c standard amd64/amd64/bpf_jit_machdep.c optional bpf_jitter amd64/amd64/cpu_switch.S standard amd64/amd64/db_disasm.c optional ddb amd64/amd64/db_interface.c optional ddb amd64/amd64/db_trace.c optional ddb amd64/amd64/efirt.c optional efirt amd64/amd64/elf_machdep.c standard amd64/amd64/exception.S standard amd64/amd64/fpu.c standard amd64/amd64/gdb_machdep.c optional gdb amd64/amd64/in_cksum.c optional inet | inet6 amd64/amd64/initcpu.c standard amd64/amd64/io.c optional io amd64/amd64/locore.S standard no-obj amd64/amd64/xen-locore.S optional xenhvm amd64/amd64/machdep.c standard amd64/amd64/mem.c optional mem amd64/amd64/minidump_machdep.c standard amd64/amd64/mp_machdep.c optional smp amd64/amd64/mpboot.S optional smp amd64/amd64/pmap.c standard amd64/amd64/prof_machdep.c optional profiling-routine amd64/amd64/ptrace_machdep.c standard amd64/amd64/sigtramp.S standard amd64/amd64/support.S standard amd64/amd64/sys_machdep.c standard amd64/amd64/trap.c standard amd64/amd64/uio_machdep.c standard amd64/amd64/uma_machdep.c standard amd64/amd64/vm_machdep.c standard amd64/cloudabi32/cloudabi32_sysvec.c optional compat_cloudabi32 amd64/cloudabi64/cloudabi64_sysvec.c optional compat_cloudabi64 amd64/pci/pci_cfgreg.c optional pci cddl/contrib/opensolaris/common/atomic/amd64/opensolaris_atomic.S optional zfs | dtrace compile-with "${ZFS_S}" cddl/dev/dtrace/amd64/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/amd64/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/x86/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" cddl/dev/dtrace/x86/dis_tables.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" cddl/dev/dtrace/x86/instr_size.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" crypto/aesni/aeskeys_amd64.S optional aesni crypto/aesni/aesni.c optional aesni aesni_ghash.o optional aesni \ dependency "$S/crypto/aesni/aesni_ghash.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes -mpclmul ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_ghash.o" aesni_wrap.o optional aesni \ dependency "$S/crypto/aesni/aesni_wrap.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_wrap.o" crypto/blowfish/bf_enc.c optional crypto | ipsec crypto/des/des_enc.c optional crypto | ipsec | netsmb crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock dev/acpica/acpi_if.m standard dev/acpica/acpi_hpet.c optional acpi dev/acpi_support/acpi_wmi_if.m standard dev/agp/agp_amd64.c optional agp dev/agp/agp_i810.c optional agp dev/agp/agp_via.c optional agp dev/amdsbwd/amdsbwd.c optional amdsbwd dev/amdtemp/amdtemp.c optional amdtemp dev/arcmsr/arcmsr.c optional arcmsr pci dev/asmc/asmc.c optional asmc isa dev/atkbdc/atkbd.c optional atkbd atkbdc dev/atkbdc/atkbd_atkbdc.c optional atkbd atkbdc 
dev/atkbdc/atkbdc.c optional atkbdc dev/atkbdc/atkbdc_isa.c optional atkbdc isa dev/atkbdc/atkbdc_subr.c optional atkbdc dev/atkbdc/psm.c optional psm atkbdc dev/bxe/bxe.c optional bxe pci dev/bxe/bxe_stats.c optional bxe pci dev/bxe/bxe_debug.c optional bxe pci dev/bxe/ecore_sp.c optional bxe pci dev/bxe/bxe_elink.c optional bxe pci dev/bxe/57710_init_values.c optional bxe pci dev/bxe/57711_init_values.c optional bxe pci dev/bxe/57712_init_values.c optional bxe pci dev/coretemp/coretemp.c optional coretemp dev/cpuctl/cpuctl.c optional cpuctl dev/dpms/dpms.c optional dpms # There are no systems with isa slots, so all ed isa entries should go.. dev/ed/if_ed_3c503.c optional ed isa ed_3c503 dev/ed/if_ed_isa.c optional ed isa dev/ed/if_ed_wd80x3.c optional ed isa dev/ed/if_ed_hpp.c optional ed isa ed_hpp dev/ed/if_ed_sic.c optional ed isa ed_sic dev/fb/fb.c optional fb | vga dev/fb/s3_pci.c optional s3pci dev/fb/vesa.c optional vga vesa dev/fb/vga.c optional vga dev/ichwd/ichwd.c optional ichwd dev/if_ndis/if_ndis.c optional ndis dev/if_ndis/if_ndis_pccard.c optional ndis pccard dev/if_ndis/if_ndis_pci.c optional ndis cardbus | ndis pci dev/if_ndis/if_ndis_usb.c optional ndis usb dev/io/iodev.c optional io dev/ioat/ioat.c optional ioat pci dev/ioat/ioat_test.c optional ioat pci dev/ipmi/ipmi.c optional ipmi dev/ipmi/ipmi_acpi.c optional ipmi acpi dev/ipmi/ipmi_isa.c optional ipmi isa dev/ipmi/ipmi_kcs.c optional ipmi dev/ipmi/ipmi_smic.c optional ipmi dev/ipmi/ipmi_smbus.c optional ipmi smbus dev/ipmi/ipmi_smbios.c optional ipmi dev/ipmi/ipmi_ssif.c optional ipmi smbus dev/ipmi/ipmi_pci.c optional ipmi pci dev/ipmi/ipmi_linux.c optional ipmi compat_linux32 dev/ixl/if_ixl.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_pf_main.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_pf_qmgr.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_pf_iov.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/if_ixlv.c optional ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixlvc.c optional ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/ixl_txrx.c optional ixl pci | ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_osdep.c optional ixl pci | ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_lan_hmc.c optional ixl pci | ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_hmc.c optional ixl pci | ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_common.c optional ixl pci | ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_nvm.c optional ixl pci | ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_adminq.c optional ixl pci | ixlv pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/fdc/fdc.c optional fdc dev/fdc/fdc_acpi.c optional fdc dev/fdc/fdc_isa.c optional fdc isa dev/fdc/fdc_pccard.c optional fdc pccard dev/hpt27xx/hpt27xx_os_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_osm_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_config.c optional hpt27xx dev/hptmv/entry.c optional hptmv dev/hptmv/mv.c optional hptmv dev/hptmv/gui_lib.c optional hptmv dev/hptmv/hptproc.c optional hptmv dev/hptmv/ioctl.c optional hptmv dev/hptnr/hptnr_os_bsd.c optional hptnr dev/hptnr/hptnr_osm_bsd.c optional hptnr dev/hptnr/hptnr_config.c optional hptnr dev/hptrr/hptrr_os_bsd.c optional hptrr dev/hptrr/hptrr_osm_bsd.c optional hptrr dev/hptrr/hptrr_config.c optional hptrr dev/hwpmc/hwpmc_amd.c optional hwpmc dev/hwpmc/hwpmc_intel.c optional hwpmc 
dev/hwpmc/hwpmc_core.c optional hwpmc dev/hwpmc/hwpmc_uncore.c optional hwpmc dev/hwpmc/hwpmc_piv.c optional hwpmc dev/hwpmc/hwpmc_tsc.c optional hwpmc dev/hwpmc/hwpmc_x86.c optional hwpmc -dev/hyperv/netvsc/hv_net_vsc.c optional hyperv +dev/hyperv/netvsc/hn_nvs.c optional hyperv dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv dev/hyperv/utilities/hv_shutdown.c optional hyperv dev/hyperv/utilities/hv_timesync.c optional hyperv dev/hyperv/utilities/hv_util.c optional hyperv dev/hyperv/vmbus/hyperv.c optional hyperv dev/hyperv/vmbus/hyperv_busdma.c optional hyperv dev/hyperv/vmbus/vmbus.c optional hyperv dev/hyperv/vmbus/vmbus_br.c optional hyperv dev/hyperv/vmbus/vmbus_chan.c optional hyperv dev/hyperv/vmbus/vmbus_et.c optional hyperv dev/hyperv/vmbus/vmbus_if.m optional hyperv dev/hyperv/vmbus/vmbus_xact.c optional hyperv dev/hyperv/vmbus/amd64/hyperv_machdep.c optional hyperv dev/hyperv/vmbus/amd64/vmbus_vector.S optional hyperv dev/nfe/if_nfe.c optional nfe pci dev/ntb/if_ntb/if_ntb.c optional if_ntb dev/ntb/ntb_transport.c optional if_ntb dev/ntb/ntb.c optional if_ntb | ntb_hw dev/ntb/ntb_if.m optional if_ntb | ntb_hw dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw dev/nvd/nvd.c optional nvd nvme dev/nvme/nvme.c optional nvme dev/nvme/nvme_ctrlr.c optional nvme dev/nvme/nvme_ctrlr_cmd.c optional nvme dev/nvme/nvme_ns.c optional nvme dev/nvme/nvme_ns_cmd.c optional nvme dev/nvme/nvme_qpair.c optional nvme dev/nvme/nvme_sim.c optional nvme scbus !nvd dev/nvme/nvme_sysctl.c optional nvme dev/nvme/nvme_test.c optional nvme dev/nvme/nvme_util.c optional nvme dev/nvram/nvram.c optional nvram isa dev/random/ivy.c optional rdrand_rng dev/random/nehemiah.c optional padlock_rng dev/qlxge/qls_dbg.c optional qlxge pci dev/qlxge/qls_dump.c optional qlxge pci dev/qlxge/qls_hw.c optional qlxge pci dev/qlxge/qls_ioctl.c optional qlxge pci dev/qlxge/qls_isr.c optional qlxge pci dev/qlxge/qls_os.c optional qlxge pci dev/qlxgb/qla_dbg.c optional qlxgb pci dev/qlxgb/qla_hw.c optional qlxgb pci dev/qlxgb/qla_ioctl.c optional qlxgb pci dev/qlxgb/qla_isr.c optional qlxgb pci dev/qlxgb/qla_misc.c optional qlxgb pci dev/qlxgb/qla_os.c optional qlxgb pci dev/qlxgbe/ql_dbg.c optional qlxgbe pci dev/qlxgbe/ql_hw.c optional qlxgbe pci dev/qlxgbe/ql_ioctl.c optional qlxgbe pci dev/qlxgbe/ql_isr.c optional qlxgbe pci dev/qlxgbe/ql_misc.c optional qlxgbe pci dev/qlxgbe/ql_os.c optional qlxgbe pci dev/qlxgbe/ql_reset.c optional qlxgbe pci dev/sfxge/common/ef10_ev.c optional sfxge pci dev/sfxge/common/ef10_filter.c optional sfxge pci dev/sfxge/common/ef10_intr.c optional sfxge pci dev/sfxge/common/ef10_mac.c optional sfxge pci dev/sfxge/common/ef10_mcdi.c optional sfxge pci dev/sfxge/common/ef10_nic.c optional sfxge pci dev/sfxge/common/ef10_nvram.c optional sfxge pci dev/sfxge/common/ef10_phy.c optional sfxge pci dev/sfxge/common/ef10_rx.c optional sfxge pci dev/sfxge/common/ef10_tx.c optional sfxge pci dev/sfxge/common/ef10_vpd.c optional sfxge pci dev/sfxge/common/efx_bootcfg.c optional sfxge pci dev/sfxge/common/efx_crc32.c optional sfxge pci dev/sfxge/common/efx_ev.c optional sfxge pci dev/sfxge/common/efx_filter.c optional sfxge pci dev/sfxge/common/efx_hash.c optional sfxge pci dev/sfxge/common/efx_intr.c optional sfxge pci dev/sfxge/common/efx_lic.c optional sfxge pci dev/sfxge/common/efx_mac.c optional 
sfxge pci dev/sfxge/common/efx_mcdi.c optional sfxge pci dev/sfxge/common/efx_mon.c optional sfxge pci dev/sfxge/common/efx_nic.c optional sfxge pci dev/sfxge/common/efx_nvram.c optional sfxge pci dev/sfxge/common/efx_phy.c optional sfxge pci dev/sfxge/common/efx_port.c optional sfxge pci dev/sfxge/common/efx_rx.c optional sfxge pci dev/sfxge/common/efx_sram.c optional sfxge pci dev/sfxge/common/efx_tx.c optional sfxge pci dev/sfxge/common/efx_vpd.c optional sfxge pci dev/sfxge/common/efx_wol.c optional sfxge pci dev/sfxge/common/hunt_nic.c optional sfxge pci dev/sfxge/common/hunt_phy.c optional sfxge pci dev/sfxge/common/mcdi_mon.c optional sfxge pci dev/sfxge/common/medford_nic.c optional sfxge pci dev/sfxge/common/siena_mac.c optional sfxge pci dev/sfxge/common/siena_mcdi.c optional sfxge pci dev/sfxge/common/siena_nic.c optional sfxge pci dev/sfxge/common/siena_nvram.c optional sfxge pci dev/sfxge/common/siena_phy.c optional sfxge pci dev/sfxge/common/siena_sram.c optional sfxge pci dev/sfxge/common/siena_vpd.c optional sfxge pci dev/sfxge/sfxge.c optional sfxge pci dev/sfxge/sfxge_dma.c optional sfxge pci dev/sfxge/sfxge_ev.c optional sfxge pci dev/sfxge/sfxge_intr.c optional sfxge pci dev/sfxge/sfxge_mcdi.c optional sfxge pci dev/sfxge/sfxge_nvram.c optional sfxge pci dev/sfxge/sfxge_port.c optional sfxge pci dev/sfxge/sfxge_rx.c optional sfxge pci dev/sfxge/sfxge_tx.c optional sfxge pci dev/sio/sio.c optional sio dev/sio/sio_isa.c optional sio isa dev/sio/sio_pccard.c optional sio pccard dev/sio/sio_pci.c optional sio pci dev/sio/sio_puc.c optional sio puc dev/speaker/spkr.c optional speaker dev/syscons/apm/apm_saver.c optional apm_saver apm dev/syscons/scterm-teken.c optional sc dev/syscons/scvesactl.c optional sc vga vesa dev/syscons/scvgarndr.c optional sc vga dev/syscons/scvtb.c optional sc dev/tpm/tpm.c optional tpm dev/tpm/tpm_acpi.c optional tpm acpi dev/tpm/tpm_isa.c optional tpm isa dev/uart/uart_cpu_x86.c optional uart dev/viawd/viawd.c optional viawd dev/vmware/vmxnet3/if_vmx.c optional vmx dev/wbwd/wbwd.c optional wbwd dev/wpi/if_wpi.c optional wpi dev/xen/pci/xen_acpi_pci.c optional xenhvm dev/xen/pci/xen_pci.c optional xenhvm dev/isci/isci.c optional isci dev/isci/isci_controller.c optional isci dev/isci/isci_domain.c optional isci dev/isci/isci_interrupt.c optional isci dev/isci/isci_io_request.c optional isci dev/isci/isci_logger.c optional isci dev/isci/isci_oem_parameters.c optional isci dev/isci/isci_remote_device.c optional isci dev/isci/isci_sysctl.c optional isci dev/isci/isci_task_request.c optional isci dev/isci/isci_timer.c optional isci dev/isci/scil/sati.c optional isci dev/isci/scil/sati_abort_task_set.c optional isci dev/isci/scil/sati_atapi.c optional isci dev/isci/scil/sati_device.c optional isci dev/isci/scil/sati_inquiry.c optional isci dev/isci/scil/sati_log_sense.c optional isci dev/isci/scil/sati_lun_reset.c optional isci dev/isci/scil/sati_mode_pages.c optional isci dev/isci/scil/sati_mode_select.c optional isci dev/isci/scil/sati_mode_sense.c optional isci dev/isci/scil/sati_mode_sense_10.c optional isci dev/isci/scil/sati_mode_sense_6.c optional isci dev/isci/scil/sati_move.c optional isci dev/isci/scil/sati_passthrough.c optional isci dev/isci/scil/sati_read.c optional isci dev/isci/scil/sati_read_buffer.c optional isci dev/isci/scil/sati_read_capacity.c optional isci dev/isci/scil/sati_reassign_blocks.c optional isci dev/isci/scil/sati_report_luns.c optional isci dev/isci/scil/sati_request_sense.c optional isci 
dev/isci/scil/sati_start_stop_unit.c optional isci dev/isci/scil/sati_synchronize_cache.c optional isci dev/isci/scil/sati_test_unit_ready.c optional isci dev/isci/scil/sati_unmap.c optional isci dev/isci/scil/sati_util.c optional isci dev/isci/scil/sati_verify.c optional isci dev/isci/scil/sati_write.c optional isci dev/isci/scil/sati_write_and_verify.c optional isci dev/isci/scil/sati_write_buffer.c optional isci dev/isci/scil/sati_write_long.c optional isci dev/isci/scil/sci_abstract_list.c optional isci dev/isci/scil/sci_base_controller.c optional isci dev/isci/scil/sci_base_domain.c optional isci dev/isci/scil/sci_base_iterator.c optional isci dev/isci/scil/sci_base_library.c optional isci dev/isci/scil/sci_base_logger.c optional isci dev/isci/scil/sci_base_memory_descriptor_list.c optional isci dev/isci/scil/sci_base_memory_descriptor_list_decorator.c optional isci dev/isci/scil/sci_base_object.c optional isci dev/isci/scil/sci_base_observer.c optional isci dev/isci/scil/sci_base_phy.c optional isci dev/isci/scil/sci_base_port.c optional isci dev/isci/scil/sci_base_remote_device.c optional isci dev/isci/scil/sci_base_request.c optional isci dev/isci/scil/sci_base_state_machine.c optional isci dev/isci/scil/sci_base_state_machine_logger.c optional isci dev/isci/scil/sci_base_state_machine_observer.c optional isci dev/isci/scil/sci_base_subject.c optional isci dev/isci/scil/sci_util.c optional isci dev/isci/scil/scic_sds_controller.c optional isci dev/isci/scil/scic_sds_library.c optional isci dev/isci/scil/scic_sds_pci.c optional isci dev/isci/scil/scic_sds_phy.c optional isci dev/isci/scil/scic_sds_port.c optional isci dev/isci/scil/scic_sds_port_configuration_agent.c optional isci dev/isci/scil/scic_sds_remote_device.c optional isci dev/isci/scil/scic_sds_remote_node_context.c optional isci dev/isci/scil/scic_sds_remote_node_table.c optional isci dev/isci/scil/scic_sds_request.c optional isci dev/isci/scil/scic_sds_sgpio.c optional isci dev/isci/scil/scic_sds_smp_remote_device.c optional isci dev/isci/scil/scic_sds_smp_request.c optional isci dev/isci/scil/scic_sds_ssp_request.c optional isci dev/isci/scil/scic_sds_stp_packet_request.c optional isci dev/isci/scil/scic_sds_stp_remote_device.c optional isci dev/isci/scil/scic_sds_stp_request.c optional isci dev/isci/scil/scic_sds_unsolicited_frame_control.c optional isci dev/isci/scil/scif_sas_controller.c optional isci dev/isci/scil/scif_sas_controller_state_handlers.c optional isci dev/isci/scil/scif_sas_controller_states.c optional isci dev/isci/scil/scif_sas_domain.c optional isci dev/isci/scil/scif_sas_domain_state_handlers.c optional isci dev/isci/scil/scif_sas_domain_states.c optional isci dev/isci/scil/scif_sas_high_priority_request_queue.c optional isci dev/isci/scil/scif_sas_internal_io_request.c optional isci dev/isci/scil/scif_sas_io_request.c optional isci dev/isci/scil/scif_sas_io_request_state_handlers.c optional isci dev/isci/scil/scif_sas_io_request_states.c optional isci dev/isci/scil/scif_sas_library.c optional isci dev/isci/scil/scif_sas_remote_device.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substates.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substates.c optional isci dev/isci/scil/scif_sas_remote_device_state_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_states.c optional isci 
dev/isci/scil/scif_sas_request.c optional isci dev/isci/scil/scif_sas_smp_activity_clear_affiliation.c optional isci dev/isci/scil/scif_sas_smp_io_request.c optional isci dev/isci/scil/scif_sas_smp_phy.c optional isci dev/isci/scil/scif_sas_smp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_io_request.c optional isci dev/isci/scil/scif_sas_stp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_task_request.c optional isci dev/isci/scil/scif_sas_task_request.c optional isci dev/isci/scil/scif_sas_task_request_state_handlers.c optional isci dev/isci/scil/scif_sas_task_request_states.c optional isci dev/isci/scil/scif_sas_timer.c optional isci isa/syscons_isa.c optional sc isa/vga_isa.c optional vga kern/kern_clocksource.c standard kern/link_elf_obj.c standard # # IA32 binary support # #amd64/ia32/ia32_exception.S optional compat_freebsd32 amd64/ia32/ia32_reg.c optional compat_freebsd32 amd64/ia32/ia32_signal.c optional compat_freebsd32 amd64/ia32/ia32_sigtramp.S optional compat_freebsd32 amd64/ia32/ia32_syscall.c optional compat_freebsd32 amd64/ia32/ia32_misc.c optional compat_freebsd32 compat/ia32/ia32_sysvec.c optional compat_freebsd32 compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs # # Linux/i386 binary support # amd64/linux32/linux32_dummy.c optional compat_linux32 amd64/linux32/linux32_machdep.c optional compat_linux32 amd64/linux32/linux32_support.s optional compat_linux32 \ dependency "linux32_assym.h" amd64/linux32/linux32_sysent.c optional compat_linux32 amd64/linux32/linux32_sysvec.c optional compat_linux32 compat/linux/linux_emul.c optional compat_linux32 compat/linux/linux_file.c optional compat_linux32 compat/linux/linux_fork.c optional compat_linux32 compat/linux/linux_futex.c optional compat_linux32 compat/linux/linux_getcwd.c optional compat_linux32 compat/linux/linux_ioctl.c optional compat_linux32 compat/linux/linux_ipc.c optional compat_linux32 compat/linux/linux_mib.c optional compat_linux32 compat/linux/linux_misc.c optional compat_linux32 compat/linux/linux_mmap.c optional compat_linux32 compat/linux/linux_signal.c optional compat_linux32 compat/linux/linux_socket.c optional compat_linux32 compat/linux/linux_stats.c optional compat_linux32 compat/linux/linux_sysctl.c optional compat_linux32 compat/linux/linux_time.c optional compat_linux32 compat/linux/linux_timer.c optional compat_linux32 compat/linux/linux_uid16.c optional compat_linux32 compat/linux/linux_util.c optional compat_linux32 compat/linux/linux_vdso.c optional compat_linux32 compat/linux/linux_common.c optional compat_linux32 compat/linux/linux_event.c optional compat_linux32 compat/linux/linux.c optional compat_linux32 dev/amr/amr_linux.c optional compat_linux32 amr dev/mfi/mfi_linux.c optional compat_linux32 mfi # # Windows NDIS driver support # compat/ndis/kern_ndis.c optional ndisapi pci compat/ndis/kern_windrv.c optional ndisapi pci compat/ndis/subr_hal.c optional ndisapi pci compat/ndis/subr_ndis.c optional ndisapi pci compat/ndis/subr_ntoskrnl.c optional ndisapi pci compat/ndis/subr_pe.c optional ndisapi pci compat/ndis/subr_usbd.c optional ndisapi pci compat/ndis/winx64_wrap.S optional ndisapi pci # libkern/memmove.c standard libkern/memset.c standard # # x86 real mode BIOS emulator, required by dpms/pci/vesa # compat/x86bios/x86bios.c optional x86bios | dpms | pci | vesa contrib/x86emu/x86emu.c optional x86bios | dpms | pci | vesa # # bvm console # dev/bvm/bvm_console.c optional bvmconsole dev/bvm/bvm_dbg.c optional bvmdebug # # x86 
shared code between IA32, AMD64 and PC98 architectures # x86/acpica/OsdEnvironment.c optional acpi x86/acpica/acpi_apm.c optional acpi x86/acpica/acpi_wakeup.c optional acpi x86/acpica/madt.c optional acpi x86/acpica/srat.c optional acpi x86/bios/smbios.c optional smbios x86/bios/vpd.c optional vpd x86/cpufreq/powernow.c optional cpufreq x86/cpufreq/est.c optional cpufreq x86/cpufreq/hwpstate.c optional cpufreq x86/cpufreq/p4tcc.c optional cpufreq x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci x86/iommu/intel_ctx.c optional acpi acpi_dmar pci x86/iommu/intel_drv.c optional acpi acpi_dmar pci x86/iommu/intel_fault.c optional acpi acpi_dmar pci x86/iommu/intel_gas.c optional acpi acpi_dmar pci x86/iommu/intel_idpgtbl.c optional acpi acpi_dmar pci x86/iommu/intel_intrmap.c optional acpi acpi_dmar pci x86/iommu/intel_qi.c optional acpi acpi_dmar pci x86/iommu/intel_quirks.c optional acpi acpi_dmar pci x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic isa x86/isa/atrtc.c standard x86/isa/clock.c standard x86/isa/elcr.c optional atpic isa | mptable x86/isa/isa.c standard x86/isa/isa_dma.c standard x86/isa/nmi.c standard x86/isa/orm.c optional isa x86/pci/pci_bus.c optional pci x86/pci/qpi.c optional pci x86/x86/autoconf.c standard x86/x86/bus_machdep.c standard x86/x86/busdma_bounce.c standard x86/x86/busdma_machdep.c standard x86/x86/cpu_machdep.c standard x86/x86/dump_machdep.c standard x86/x86/fdt_machdep.c optional fdt x86/x86/identcpu.c standard x86/x86/intr_machdep.c standard x86/x86/io_apic.c standard x86/x86/legacy.c standard x86/x86/local_apic.c standard x86/x86/mca.c standard x86/x86/mptable.c optional mptable x86/x86/mptable_pci.c optional mptable pci x86/x86/mp_x86.c optional smp x86/x86/mp_watchdog.c optional mp_watchdog smp x86/x86/msi.c optional pci x86/x86/nexus.c standard x86/x86/pvclock.c standard x86/x86/stack_machdep.c optional ddb | stack x86/x86/tsc.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm x86/xen/xen_intr.c optional xenhvm x86/xen/pv.c optional xenhvm x86/xen/pvcpu_enum.c optional xenhvm x86/xen/xen_apic.c optional xenhvm x86/xen/xenpv.c optional xenhvm x86/xen/xen_nexus.c optional xenhvm x86/xen/xen_msi.c optional xenhvm x86/xen/xen_pci_bus.c optional xenhvm Index: user/alc/PQ_LAUNDRY/sys/conf/files.i386 =================================================================== --- user/alc/PQ_LAUNDRY/sys/conf/files.i386 (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/conf/files.i386 (revision 308054) @@ -1,638 +1,638 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. 
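To make the entry syntax these header comments describe concrete, here is a minimal illustrative sketch; the "foo" driver, its option name, and the firmware blob are hypothetical and are not part of this change. A plain source file needs only a path and a standard/optional clause, while a generated object spells its whole recipe on one logical line with backslash continuations, in the style of the hpt27xx_lib.o and hptrr_lib.o entries later in this file, precisely because config cannot accept a backslash-newline inside a quoted string and silently ignores every dependency line after the first:

dev/foo/foo.c		optional	foo pci
foo_fw.o		optional	foo			\
	dependency	"$S/dev/foo/foo_fw.bin.uu"	\
	compile-with	"uudecode < $S/dev/foo/foo_fw.bin.uu"	\
	no-implicit-rule					\
	clean		"foo_fw.o"

The first line is an ordinary object built by the normal kernel rules; the second asks config to uudecode a pre-built blob instead of compiling anything, which is why it carries no-implicit-rule and a clean target.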
# cloudabi32_vdso.o optional compat_cloudabi32 \ dependency "$S/contrib/cloudabi/cloudabi_vdso_i686.S" \ compile-with "${CC} -x assembler-with-cpp -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_i686.S -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "cloudabi32_vdso.o" # cloudabi32_vdso_blob.o optional compat_cloudabi32 \ dependency "cloudabi32_vdso.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd --binary-architecture i386 cloudabi32_vdso.o ${.TARGET}" \ no-implicit-rule \ clean "cloudabi32_vdso_blob.o" # linux_genassym.o optional compat_linux \ dependency "$S/i386/linux/linux_genassym.c" \ compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ no-obj no-implicit-rule \ clean "linux_genassym.o" # linux_assym.h optional compat_linux \ dependency "$S/kern/genassym.sh linux_genassym.o" \ compile-with "sh $S/kern/genassym.sh linux_genassym.o > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "linux_assym.h" # linux_locore.o optional compat_linux \ dependency "linux_assym.h $S/i386/linux/linux_locore.s" \ compile-with "${CC} -x assembler-with-cpp -DLOCORE -shared -s -pipe -I. -I$S -Werror -Wall -fno-common -nostdinc -nostdlib -Wl,-T$S/i386/linux/linux_vdso.lds.s -Wl,-soname=linux_vdso.so,--eh-frame-hdr,-fPIC,-warn-common ${.IMPSRC} -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "linux_locore.o" # linux_vdso.so optional compat_linux \ dependency "linux_locore.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd --binary-architecture i386 linux_locore.o ${.TARGET}" \ no-implicit-rule \ clean "linux_vdso.so" # svr4_genassym.o optional compat_svr4 \ dependency "$S/i386/svr4/svr4_genassym.c" \ compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ no-obj no-implicit-rule \ clean "svr4_genassym.o" # svr4_assym.h optional compat_svr4 \ dependency "$S/kern/genassym.sh svr4_genassym.o" \ compile-with "sh $S/kern/genassym.sh svr4_genassym.o > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "svr4_assym.h" # font.h optional sc_dflt_font \ compile-with "uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x16.fnt && file2c 'static u_char dflt_font_16[16*256] = {' '};' < ${SC_DFLT_FONT}-8x16 > font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x14.fnt && file2c 'static u_char dflt_font_14[14*256] = {' '};' < ${SC_DFLT_FONT}-8x14 >> font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x8.fnt && file2c 'static u_char dflt_font_8[8*256] = {' '};' < ${SC_DFLT_FONT}-8x8 >> font.h" \ no-obj no-implicit-rule before-depend \ clean "font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8" # atkbdmap.h optional atkbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${ATKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > atkbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "atkbdmap.h" # ukbdmap.h optional ukbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${UKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > ukbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "ukbdmap.h" # hpt27xx_lib.o optional hpt27xx \ dependency "$S/dev/hpt27xx/i386-elf.hpt27xx_lib.o.uu" \ compile-with "uudecode < 
$S/dev/hpt27xx/i386-elf.hpt27xx_lib.o.uu" \ no-implicit-rule # hptmvraid.o optional hptmv \ dependency "$S/dev/hptmv/i386-elf.raid.o.uu" \ compile-with "uudecode < $S/dev/hptmv/i386-elf.raid.o.uu" \ no-implicit-rule # hptnr_lib.o optional hptnr \ dependency "$S/dev/hptnr/i386-elf.hptnr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptnr/i386-elf.hptnr_lib.o.uu" \ no-implicit-rule # hptrr_lib.o optional hptrr \ dependency "$S/dev/hptrr/i386-elf.hptrr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptrr/i386-elf.hptrr_lib.o.uu" \ no-implicit-rule # cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs | dtrace compile-with "${ZFS_S}" cddl/dev/dtrace/i386/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/i386/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/x86/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" cddl/dev/dtrace/x86/dis_tables.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" cddl/dev/dtrace/x86/instr_size.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs compat/linux/linux_event.c optional compat_linux compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_fork.c optional compat_linux compat/linux/linux_futex.c optional compat_linux compat/linux/linux_getcwd.c optional compat_linux compat/linux/linux_ioctl.c optional compat_linux compat/linux/linux_ipc.c optional compat_linux compat/linux/linux_mib.c optional compat_linux compat/linux/linux_misc.c optional compat_linux compat/linux/linux_mmap.c optional compat_linux compat/linux/linux_signal.c optional compat_linux compat/linux/linux_socket.c optional compat_linux compat/linux/linux_stats.c optional compat_linux compat/linux/linux_sysctl.c optional compat_linux compat/linux/linux_time.c optional compat_linux compat/linux/linux_timer.c optional compat_linux compat/linux/linux_uid16.c optional compat_linux compat/linux/linux_util.c optional compat_linux compat/linux/linux_vdso.c optional compat_linux compat/linux/linux.c optional compat_linux compat/ndis/kern_ndis.c optional ndisapi pci compat/ndis/kern_windrv.c optional ndisapi pci compat/ndis/subr_hal.c optional ndisapi pci compat/ndis/subr_ndis.c optional ndisapi pci compat/ndis/subr_ntoskrnl.c optional ndisapi pci compat/ndis/subr_pe.c optional ndisapi pci compat/ndis/subr_usbd.c optional ndisapi pci compat/ndis/winx32_wrap.S optional ndisapi pci compat/svr4/imgact_svr4.c optional compat_svr4 compat/svr4/svr4_fcntl.c optional compat_svr4 compat/svr4/svr4_filio.c optional compat_svr4 compat/svr4/svr4_ioctl.c optional compat_svr4 compat/svr4/svr4_ipc.c optional compat_svr4 compat/svr4/svr4_misc.c optional compat_svr4 compat/svr4/svr4_resource.c optional compat_svr4 compat/svr4/svr4_signal.c optional compat_svr4 compat/svr4/svr4_socket.c optional compat_svr4 compat/svr4/svr4_sockio.c optional compat_svr4 compat/svr4/svr4_stat.c optional compat_svr4 compat/svr4/svr4_stream.c optional compat_svr4 compat/svr4/svr4_syscallnames.c optional compat_svr4 compat/svr4/svr4_sysent.c optional compat_svr4 compat/svr4/svr4_sysvec.c optional compat_svr4 compat/svr4/svr4_termios.c optional compat_svr4 bf_enc.o optional crypto | ipsec \ dependency "$S/crypto/blowfish/arch/i386/bf_enc.S $S/crypto/blowfish/arch/i386/bf_enc_586.S $S/crypto/blowfish/arch/i386/bf_enc_686.S" \ compile-with "${CC} -c -I$S/crypto/blowfish/arch/i386 ${ASM_CFLAGS} ${WERROR} 
${.IMPSRC}" \ no-implicit-rule crypto/aesni/aeskeys_i386.S optional aesni crypto/aesni/aesni.c optional aesni aesni_ghash.o optional aesni \ dependency "$S/crypto/aesni/aesni_ghash.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes -mpclmul ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_ghash.o" aesni_wrap.o optional aesni \ dependency "$S/crypto/aesni/aesni_wrap.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_wrap.o" crypto/des/arch/i386/des_enc.S optional crypto | ipsec | netsmb crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock dev/advansys/adv_isa.c optional adv isa dev/agp/agp_ali.c optional agp dev/agp/agp_amd.c optional agp dev/agp/agp_amd64.c optional agp dev/agp/agp_ati.c optional agp dev/agp/agp_i810.c optional agp dev/agp/agp_intel.c optional agp dev/agp/agp_nvidia.c optional agp dev/agp/agp_sis.c optional agp dev/agp/agp_via.c optional agp dev/aic/aic_isa.c optional aic isa dev/amdsbwd/amdsbwd.c optional amdsbwd dev/amdtemp/amdtemp.c optional amdtemp dev/arcmsr/arcmsr.c optional arcmsr pci dev/asmc/asmc.c optional asmc isa dev/atkbdc/atkbd.c optional atkbd atkbdc dev/atkbdc/atkbd_atkbdc.c optional atkbd atkbdc dev/atkbdc/atkbdc.c optional atkbdc dev/atkbdc/atkbdc_isa.c optional atkbdc isa dev/atkbdc/atkbdc_subr.c optional atkbdc dev/atkbdc/psm.c optional psm atkbdc dev/bxe/bxe.c optional bxe pci dev/bxe/bxe_stats.c optional bxe pci dev/bxe/bxe_debug.c optional bxe pci dev/bxe/ecore_sp.c optional bxe pci dev/bxe/bxe_elink.c optional bxe pci dev/bxe/57710_init_values.c optional bxe pci dev/bxe/57711_init_values.c optional bxe pci dev/bxe/57712_init_values.c optional bxe pci dev/ce/ceddk.c optional ce dev/ce/if_ce.c optional ce dev/ce/tau32-ddk.c optional ce \ compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}" dev/cm/if_cm_isa.c optional cm isa dev/coretemp/coretemp.c optional coretemp dev/cp/cpddk.c optional cp dev/cp/if_cp.c optional cp dev/cpuctl/cpuctl.c optional cpuctl dev/ctau/ctau.c optional ctau dev/ctau/ctddk.c optional ctau dev/ctau/if_ct.c optional ctau dev/cx/csigma.c optional cx dev/cx/cxddk.c optional cx dev/cx/if_cx.c optional cx dev/dpms/dpms.c optional dpms dev/ed/if_ed_3c503.c optional ed isa ed_3c503 dev/ed/if_ed_isa.c optional ed isa dev/ed/if_ed_wd80x3.c optional ed isa dev/ed/if_ed_hpp.c optional ed isa ed_hpp dev/ed/if_ed_sic.c optional ed isa ed_sic dev/fb/fb.c optional fb | vga dev/fb/s3_pci.c optional s3pci dev/fb/vesa.c optional vga vesa dev/fb/vga.c optional vga dev/fdc/fdc.c optional fdc dev/fdc/fdc_acpi.c optional fdc dev/fdc/fdc_isa.c optional fdc isa dev/fdc/fdc_pccard.c optional fdc pccard dev/fe/if_fe_isa.c optional fe isa dev/glxiic/glxiic.c optional glxiic dev/glxsb/glxsb.c optional glxsb dev/glxsb/glxsb_hash.c optional glxsb dev/hpt27xx/hpt27xx_os_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_osm_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_config.c optional hpt27xx dev/hptmv/entry.c optional hptmv dev/hptmv/mv.c optional hptmv dev/hptmv/gui_lib.c optional hptmv dev/hptmv/hptproc.c optional hptmv dev/hptmv/ioctl.c optional hptmv dev/hptnr/hptnr_os_bsd.c optional hptnr dev/hptnr/hptnr_osm_bsd.c optional hptnr dev/hptnr/hptnr_config.c optional hptnr dev/hptrr/hptrr_os_bsd.c optional hptrr dev/hptrr/hptrr_osm_bsd.c optional hptrr dev/hptrr/hptrr_config.c optional hptrr 
dev/hwpmc/hwpmc_amd.c optional hwpmc dev/hwpmc/hwpmc_intel.c optional hwpmc dev/hwpmc/hwpmc_core.c optional hwpmc dev/hwpmc/hwpmc_uncore.c optional hwpmc dev/hwpmc/hwpmc_pentium.c optional hwpmc dev/hwpmc/hwpmc_piv.c optional hwpmc dev/hwpmc/hwpmc_ppro.c optional hwpmc dev/hwpmc/hwpmc_tsc.c optional hwpmc dev/hwpmc/hwpmc_x86.c optional hwpmc -dev/hyperv/netvsc/hv_net_vsc.c optional hyperv +dev/hyperv/netvsc/hn_nvs.c optional hyperv dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c optional hyperv dev/hyperv/netvsc/hv_rndis_filter.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv dev/hyperv/utilities/hv_heartbeat.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv dev/hyperv/utilities/hv_shutdown.c optional hyperv dev/hyperv/utilities/hv_timesync.c optional hyperv dev/hyperv/utilities/hv_util.c optional hyperv dev/hyperv/vmbus/hyperv.c optional hyperv dev/hyperv/vmbus/hyperv_busdma.c optional hyperv dev/hyperv/vmbus/vmbus.c optional hyperv dev/hyperv/vmbus/vmbus_br.c optional hyperv dev/hyperv/vmbus/vmbus_chan.c optional hyperv dev/hyperv/vmbus/vmbus_et.c optional hyperv dev/hyperv/vmbus/vmbus_if.m optional hyperv dev/hyperv/vmbus/vmbus_xact.c optional hyperv dev/hyperv/vmbus/i386/hyperv_machdep.c optional hyperv dev/hyperv/vmbus/i386/vmbus_vector.S optional hyperv dev/ichwd/ichwd.c optional ichwd dev/if_ndis/if_ndis.c optional ndis dev/if_ndis/if_ndis_pccard.c optional ndis pccard dev/if_ndis/if_ndis_pci.c optional ndis cardbus | ndis pci dev/if_ndis/if_ndis_usb.c optional ndis usb dev/io/iodev.c optional io dev/ipmi/ipmi.c optional ipmi dev/ipmi/ipmi_acpi.c optional ipmi acpi dev/ipmi/ipmi_isa.c optional ipmi isa dev/ipmi/ipmi_kcs.c optional ipmi dev/ipmi/ipmi_smic.c optional ipmi dev/ipmi/ipmi_smbus.c optional ipmi smbus dev/ipmi/ipmi_smbios.c optional ipmi dev/ipmi/ipmi_ssif.c optional ipmi smbus dev/ipmi/ipmi_pci.c optional ipmi pci dev/ipmi/ipmi_linux.c optional ipmi compat_linux dev/le/if_le_isa.c optional le isa dev/mse/mse.c optional mse dev/mse/mse_isa.c optional mse isa dev/nfe/if_nfe.c optional nfe pci dev/ntb/if_ntb/if_ntb.c optional if_ntb dev/ntb/ntb_transport.c optional if_ntb dev/ntb/ntb.c optional if_ntb | ntb_hw dev/ntb/ntb_if.m optional if_ntb | ntb_hw dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw dev/nvd/nvd.c optional nvd nvme dev/nvme/nvme.c optional nvme dev/nvme/nvme_ctrlr.c optional nvme dev/nvme/nvme_ctrlr_cmd.c optional nvme dev/nvme/nvme_ns.c optional nvme dev/nvme/nvme_ns_cmd.c optional nvme dev/nvme/nvme_qpair.c optional nvme dev/nvme/nvme_sysctl.c optional nvme dev/nvme/nvme_test.c optional nvme dev/nvme/nvme_util.c optional nvme dev/nvram/nvram.c optional nvram isa dev/ofw/ofwpci.c optional fdt pci dev/pcf/pcf_isa.c optional pcf dev/random/ivy.c optional rdrand_rng dev/random/nehemiah.c optional padlock_rng dev/sbni/if_sbni.c optional sbni dev/sbni/if_sbni_isa.c optional sbni isa dev/sbni/if_sbni_pci.c optional sbni pci dev/sio/sio.c optional sio dev/sio/sio_isa.c optional sio isa dev/sio/sio_pccard.c optional sio pccard dev/sio/sio_pci.c optional sio pci dev/sio/sio_puc.c optional sio puc dev/speaker/spkr.c optional speaker dev/syscons/apm/apm_saver.c optional apm_saver apm dev/syscons/scterm-teken.c optional sc dev/syscons/scvesactl.c optional sc vga vesa dev/syscons/scvgarndr.c optional sc vga dev/syscons/scvtb.c optional sc dev/tpm/tpm.c optional tpm dev/tpm/tpm_acpi.c optional tpm acpi dev/tpm/tpm_isa.c optional tpm isa dev/uart/uart_cpu_x86.c optional uart dev/viawd/viawd.c optional viawd dev/vmware/vmxnet3/if_vmx.c optional 
vmx dev/acpica/acpi_if.m standard dev/acpica/acpi_hpet.c optional acpi dev/acpi_support/acpi_wmi_if.m standard dev/wbwd/wbwd.c optional wbwd dev/wpi/if_wpi.c optional wpi dev/isci/isci.c optional isci dev/isci/isci_controller.c optional isci dev/isci/isci_domain.c optional isci dev/isci/isci_interrupt.c optional isci dev/isci/isci_io_request.c optional isci dev/isci/isci_logger.c optional isci dev/isci/isci_oem_parameters.c optional isci dev/isci/isci_remote_device.c optional isci dev/isci/isci_sysctl.c optional isci dev/isci/isci_task_request.c optional isci dev/isci/isci_timer.c optional isci dev/isci/scil/sati.c optional isci dev/isci/scil/sati_abort_task_set.c optional isci dev/isci/scil/sati_atapi.c optional isci dev/isci/scil/sati_device.c optional isci dev/isci/scil/sati_inquiry.c optional isci dev/isci/scil/sati_log_sense.c optional isci dev/isci/scil/sati_lun_reset.c optional isci dev/isci/scil/sati_mode_pages.c optional isci dev/isci/scil/sati_mode_select.c optional isci dev/isci/scil/sati_mode_sense.c optional isci dev/isci/scil/sati_mode_sense_10.c optional isci dev/isci/scil/sati_mode_sense_6.c optional isci dev/isci/scil/sati_move.c optional isci dev/isci/scil/sati_passthrough.c optional isci dev/isci/scil/sati_read.c optional isci dev/isci/scil/sati_read_buffer.c optional isci dev/isci/scil/sati_read_capacity.c optional isci dev/isci/scil/sati_reassign_blocks.c optional isci dev/isci/scil/sati_report_luns.c optional isci dev/isci/scil/sati_request_sense.c optional isci dev/isci/scil/sati_start_stop_unit.c optional isci dev/isci/scil/sati_synchronize_cache.c optional isci dev/isci/scil/sati_test_unit_ready.c optional isci dev/isci/scil/sati_unmap.c optional isci dev/isci/scil/sati_util.c optional isci dev/isci/scil/sati_verify.c optional isci dev/isci/scil/sati_write.c optional isci dev/isci/scil/sati_write_and_verify.c optional isci dev/isci/scil/sati_write_buffer.c optional isci dev/isci/scil/sati_write_long.c optional isci dev/isci/scil/sci_abstract_list.c optional isci dev/isci/scil/sci_base_controller.c optional isci dev/isci/scil/sci_base_domain.c optional isci dev/isci/scil/sci_base_iterator.c optional isci dev/isci/scil/sci_base_library.c optional isci dev/isci/scil/sci_base_logger.c optional isci dev/isci/scil/sci_base_memory_descriptor_list.c optional isci dev/isci/scil/sci_base_memory_descriptor_list_decorator.c optional isci dev/isci/scil/sci_base_object.c optional isci dev/isci/scil/sci_base_observer.c optional isci dev/isci/scil/sci_base_phy.c optional isci dev/isci/scil/sci_base_port.c optional isci dev/isci/scil/sci_base_remote_device.c optional isci dev/isci/scil/sci_base_request.c optional isci dev/isci/scil/sci_base_state_machine.c optional isci dev/isci/scil/sci_base_state_machine_logger.c optional isci dev/isci/scil/sci_base_state_machine_observer.c optional isci dev/isci/scil/sci_base_subject.c optional isci dev/isci/scil/sci_util.c optional isci dev/isci/scil/scic_sds_controller.c optional isci dev/isci/scil/scic_sds_library.c optional isci dev/isci/scil/scic_sds_pci.c optional isci dev/isci/scil/scic_sds_phy.c optional isci dev/isci/scil/scic_sds_port.c optional isci dev/isci/scil/scic_sds_port_configuration_agent.c optional isci dev/isci/scil/scic_sds_remote_device.c optional isci dev/isci/scil/scic_sds_remote_node_context.c optional isci dev/isci/scil/scic_sds_remote_node_table.c optional isci dev/isci/scil/scic_sds_request.c optional isci dev/isci/scil/scic_sds_sgpio.c optional isci dev/isci/scil/scic_sds_smp_remote_device.c optional isci 
dev/isci/scil/scic_sds_smp_request.c optional isci dev/isci/scil/scic_sds_ssp_request.c optional isci dev/isci/scil/scic_sds_stp_packet_request.c optional isci dev/isci/scil/scic_sds_stp_remote_device.c optional isci dev/isci/scil/scic_sds_stp_request.c optional isci dev/isci/scil/scic_sds_unsolicited_frame_control.c optional isci dev/isci/scil/scif_sas_controller.c optional isci dev/isci/scil/scif_sas_controller_state_handlers.c optional isci dev/isci/scil/scif_sas_controller_states.c optional isci dev/isci/scil/scif_sas_domain.c optional isci dev/isci/scil/scif_sas_domain_state_handlers.c optional isci dev/isci/scil/scif_sas_domain_states.c optional isci dev/isci/scil/scif_sas_high_priority_request_queue.c optional isci dev/isci/scil/scif_sas_internal_io_request.c optional isci dev/isci/scil/scif_sas_io_request.c optional isci dev/isci/scil/scif_sas_io_request_state_handlers.c optional isci dev/isci/scil/scif_sas_io_request_states.c optional isci dev/isci/scil/scif_sas_library.c optional isci dev/isci/scil/scif_sas_remote_device.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substates.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substates.c optional isci dev/isci/scil/scif_sas_remote_device_state_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_states.c optional isci dev/isci/scil/scif_sas_request.c optional isci dev/isci/scil/scif_sas_smp_activity_clear_affiliation.c optional isci dev/isci/scil/scif_sas_smp_io_request.c optional isci dev/isci/scil/scif_sas_smp_phy.c optional isci dev/isci/scil/scif_sas_smp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_io_request.c optional isci dev/isci/scil/scif_sas_stp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_task_request.c optional isci dev/isci/scil/scif_sas_task_request.c optional isci dev/isci/scil/scif_sas_task_request_state_handlers.c optional isci dev/isci/scil/scif_sas_task_request_states.c optional isci dev/isci/scil/scif_sas_timer.c optional isci i386/acpica/acpi_machdep.c optional acpi acpi_wakecode.o optional acpi \ dependency "$S/i386/acpica/acpi_wakecode.S assym.s" \ compile-with "${NORMAL_S}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.o" acpi_wakecode.bin optional acpi \ dependency "acpi_wakecode.o" \ compile-with "${OBJCOPY} -S -O binary acpi_wakecode.o ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.bin" acpi_wakecode.h optional acpi \ dependency "acpi_wakecode.bin" \ compile-with "file2c -sx 'static char wakecode[] = {' '};' < acpi_wakecode.bin > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.h" acpi_wakedata.h optional acpi \ dependency "acpi_wakecode.o" \ compile-with '${NM} -n --defined-only acpi_wakecode.o | while read offset dummy what; do echo "#define $${what} 0x$${offset}"; done > ${.TARGET}' \ no-obj no-implicit-rule before-depend \ clean "acpi_wakedata.h" # i386/bios/apm.c optional apm i386/bios/mca_machdep.c optional mca i386/bios/smapi.c optional smapi i386/bios/smapi_bios.S optional smapi i386/cloudabi32/cloudabi32_sysvec.c optional compat_cloudabi32 #i386/i386/apic_vector.s optional apic i386/i386/atomic.c standard \ compile-with "${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" i386/i386/bios.c standard i386/i386/bioscall.s standard i386/i386/bpf_jit_machdep.c optional bpf_jitter 
i386/i386/db_disasm.c optional ddb i386/i386/db_interface.c optional ddb i386/i386/db_trace.c optional ddb i386/i386/elan-mmcr.c optional cpu_elan | cpu_soekris i386/i386/elf_machdep.c standard i386/i386/exception.s standard i386/i386/gdb_machdep.c optional gdb i386/i386/geode.c optional cpu_geode i386/i386/i686_mem.c optional mem i386/i386/in_cksum.c optional inet | inet6 i386/i386/initcpu.c standard i386/i386/io.c optional io i386/i386/k6_mem.c optional mem i386/i386/locore.s standard no-obj i386/i386/longrun.c optional cpu_enable_longrun i386/i386/machdep.c standard i386/i386/mem.c optional mem i386/i386/minidump_machdep.c standard i386/i386/mp_clock.c optional smp i386/i386/mp_machdep.c optional smp i386/i386/mpboot.s optional smp i386/i386/perfmon.c optional perfmon i386/i386/pmap.c standard i386/i386/ptrace_machdep.c standard i386/i386/support.s standard i386/i386/swtch.s standard i386/i386/sys_machdep.c standard i386/i386/trap.c standard i386/i386/uio_machdep.c standard i386/i386/vm86.c standard i386/i386/vm_machdep.c standard i386/ibcs2/ibcs2_errno.c optional ibcs2 i386/ibcs2/ibcs2_fcntl.c optional ibcs2 i386/ibcs2/ibcs2_ioctl.c optional ibcs2 i386/ibcs2/ibcs2_ipc.c optional ibcs2 i386/ibcs2/ibcs2_isc.c optional ibcs2 i386/ibcs2/ibcs2_isc_sysent.c optional ibcs2 i386/ibcs2/ibcs2_misc.c optional ibcs2 i386/ibcs2/ibcs2_msg.c optional ibcs2 i386/ibcs2/ibcs2_other.c optional ibcs2 i386/ibcs2/ibcs2_signal.c optional ibcs2 i386/ibcs2/ibcs2_socksys.c optional ibcs2 i386/ibcs2/ibcs2_stat.c optional ibcs2 i386/ibcs2/ibcs2_sysent.c optional ibcs2 i386/ibcs2/ibcs2_sysi86.c optional ibcs2 i386/ibcs2/ibcs2_sysvec.c optional ibcs2 i386/ibcs2/ibcs2_util.c optional ibcs2 i386/ibcs2/ibcs2_xenix.c optional ibcs2 i386/ibcs2/ibcs2_xenix_sysent.c optional ibcs2 i386/ibcs2/imgact_coff.c optional ibcs2 i386/isa/elink.c optional ep | ie i386/isa/npx.c optional npx i386/isa/pmtimer.c optional pmtimer i386/isa/prof_machdep.c optional profiling-routine i386/linux/imgact_linux.c optional compat_linux i386/linux/linux_dummy.c optional compat_linux i386/linux/linux_machdep.c optional compat_linux i386/linux/linux_ptrace.c optional compat_linux i386/linux/linux_support.s optional compat_linux \ dependency "linux_assym.h" i386/linux/linux_sysent.c optional compat_linux i386/linux/linux_sysvec.c optional compat_linux i386/pci/pci_cfgreg.c optional pci i386/pci/pci_pir.c optional pci i386/svr4/svr4_locore.s optional compat_svr4 \ dependency "svr4_assym.h" \ warning "COMPAT_SVR4 is broken and should be avoided" i386/svr4/svr4_machdep.c optional compat_svr4 # isa/syscons_isa.c optional sc isa/vga_isa.c optional vga kern/kern_clocksource.c standard kern/imgact_aout.c optional compat_aout kern/imgact_gzip.c optional gzip kern/subr_sfbuf.c standard libkern/divdi3.c standard libkern/ffsll.c standard libkern/flsll.c standard libkern/memmove.c standard libkern/memset.c standard libkern/moddi3.c standard libkern/qdivrem.c standard libkern/ucmpdi2.c standard libkern/udivdi3.c standard libkern/umoddi3.c standard i386/xbox/xbox.c optional xbox i386/xbox/xboxfb.c optional xboxfb dev/fb/boot_font.c optional xboxfb i386/xbox/pic16l.s optional xbox # # x86 real mode BIOS support, required by dpms/pci/vesa # compat/x86bios/x86bios.c optional x86bios | dpms | pci | vesa # # bvm console # dev/bvm/bvm_console.c optional bvmconsole dev/bvm/bvm_dbg.c optional bvmdebug # # x86 shared code between IA32, AMD64 and PC98 architectures # x86/acpica/OsdEnvironment.c optional acpi x86/acpica/acpi_apm.c optional acpi x86/acpica/acpi_wakeup.c 
optional acpi x86/acpica/madt.c optional acpi apic x86/acpica/srat.c optional acpi x86/bios/smbios.c optional smbios x86/bios/vpd.c optional vpd x86/cpufreq/est.c optional cpufreq x86/cpufreq/hwpstate.c optional cpufreq x86/cpufreq/p4tcc.c optional cpufreq x86/cpufreq/powernow.c optional cpufreq x86/cpufreq/smist.c optional cpufreq x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci x86/iommu/intel_ctx.c optional acpi acpi_dmar pci x86/iommu/intel_drv.c optional acpi acpi_dmar pci x86/iommu/intel_fault.c optional acpi acpi_dmar pci x86/iommu/intel_gas.c optional acpi acpi_dmar pci x86/iommu/intel_idpgtbl.c optional acpi acpi_dmar pci x86/iommu/intel_intrmap.c optional acpi acpi_dmar pci x86/iommu/intel_qi.c optional acpi acpi_dmar pci x86/iommu/intel_quirks.c optional acpi acpi_dmar pci x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic x86/isa/atrtc.c standard x86/isa/clock.c standard x86/isa/elcr.c optional atpic | apic x86/isa/isa.c optional isa x86/isa/isa_dma.c optional isa x86/isa/nmi.c standard x86/isa/orm.c optional isa x86/pci/pci_bus.c optional pci x86/pci/qpi.c optional pci x86/x86/autoconf.c standard x86/x86/bus_machdep.c standard x86/x86/busdma_bounce.c standard x86/x86/busdma_machdep.c standard x86/x86/cpu_machdep.c standard x86/x86/dump_machdep.c standard x86/x86/fdt_machdep.c optional fdt x86/x86/identcpu.c standard x86/x86/intr_machdep.c standard x86/x86/io_apic.c optional apic x86/x86/legacy.c standard x86/x86/local_apic.c optional apic x86/x86/mca.c standard x86/x86/mptable.c optional apic x86/x86/mptable_pci.c optional apic pci x86/x86/mp_x86.c optional smp x86/x86/mp_watchdog.c optional mp_watchdog smp x86/x86/msi.c optional apic pci x86/x86/nexus.c standard x86/x86/stack_machdep.c optional ddb | stack x86/x86/tsc.c standard x86/x86/pvclock.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm x86/xen/xen_intr.c optional xenhvm x86/xen/xen_apic.c optional xenhvm x86/xen/xenpv.c optional xenhvm x86/xen/xen_nexus.c optional xenhvm x86/xen/xen_msi.c optional xenhvm Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_rndis_filter.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_rndis_filter.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_rndis_filter.h (nonexistent) @@ -1,48 +0,0 @@ -/*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. - * Copyright (c) 2010-2012 Citrix Inc. - * Copyright (c) 2012 NetApp Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef __HV_RNDIS_FILTER_H__ -#define __HV_RNDIS_FILTER_H__ - -#include -#include -#include - -/* - * Externs - */ -struct hn_rx_ring; - -void hv_rf_on_receive(struct hn_softc *sc, struct hn_rx_ring *rxr, - const void *data, int dlen); -void hv_rf_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr); - -#endif /* __HV_RNDIS_FILTER_H__ */ - Property changes on: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_rndis_filter.h ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.c (nonexistent) @@ -1,721 +0,0 @@ -/*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. - * Copyright (c) 2010-2012 Citrix Inc. - * Copyright (c) 2012 NetApp Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -/** - * HyperV vmbus network VSC (virtual services client) module - * - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -MALLOC_DEFINE(M_NETVSC, "netvsc", "Hyper-V netvsc driver"); - -/* - * Forward declarations - */ -static int hn_nvs_conn_chim(struct hn_softc *sc); -static int hn_nvs_conn_rxbuf(struct hn_softc *); -static int hn_nvs_disconn_chim(struct hn_softc *sc); -static int hn_nvs_disconn_rxbuf(struct hn_softc *sc); -static void hn_nvs_sent_none(struct hn_send_ctx *sndc, - struct hn_softc *, struct vmbus_channel *chan, - const void *, int); - -struct hn_send_ctx hn_send_ctx_none = - HN_SEND_CTX_INITIALIZER(hn_nvs_sent_none, NULL); - -static const uint32_t hn_nvs_version[] = { - HN_NVS_VERSION_5, - HN_NVS_VERSION_4, - HN_NVS_VERSION_2, - HN_NVS_VERSION_1 -}; - -uint32_t -hn_chim_alloc(struct hn_softc *sc) -{ - int i, bmap_cnt = sc->hn_chim_bmap_cnt; - u_long *bmap = sc->hn_chim_bmap; - uint32_t ret = HN_NVS_CHIM_IDX_INVALID; - - for (i = 0; i < bmap_cnt; ++i) { - int idx; - - idx = ffsl(~bmap[i]); - if (idx == 0) - continue; - - --idx; /* ffsl is 1-based */ - KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, - ("invalid i %d and idx %d", i, idx)); - - if (atomic_testandset_long(&bmap[i], idx)) - continue; - - ret = i * LONG_BIT + idx; - break; - } - return (ret); -} - -static const void * -hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, - void *req, int reqlen, size_t *resplen0, uint32_t type) -{ - struct hn_send_ctx sndc; - size_t resplen, min_resplen = *resplen0; - const struct hn_nvs_hdr *hdr; - int error; - - KASSERT(min_resplen >= sizeof(*hdr), - ("invalid minimum response len %zu", min_resplen)); - - /* - * Execute the xact setup by the caller. - */ - hn_send_ctx_init(&sndc, hn_nvs_sent_xact, xact); - - vmbus_xact_activate(xact); - error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, - req, reqlen, &sndc); - if (error) { - vmbus_xact_deactivate(xact); - return (NULL); - } - hdr = vmbus_xact_wait(xact, &resplen); - - /* - * Check this NVS response message. - */ - if (resplen < min_resplen) { - if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen); - return (NULL); - } - if (hdr->nvs_type != type) { - if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, " - "expect 0x%08x\n", hdr->nvs_type, type); - return (NULL); - } - /* All pass! */ - *resplen0 = resplen; - return (hdr); -} - -static __inline int -hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen) -{ - - return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE, - req, reqlen, &hn_send_ctx_none)); -} - -static int -hn_nvs_conn_rxbuf(struct hn_softc *sc) -{ - struct vmbus_xact *xact = NULL; - struct hn_nvs_rxbuf_conn *conn; - const struct hn_nvs_rxbuf_connresp *resp; - size_t resp_len; - uint32_t status; - int error, rxbuf_size; - - /* - * Limit RXBUF size for old NVS. - */ - if (sc->hn_nvs_ver <= HN_NVS_VERSION_2) - rxbuf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY; - else - rxbuf_size = NETVSC_RECEIVE_BUFFER_SIZE; - - /* - * Connect the RXBUF GPADL to the primary channel. - * - * NOTE: - * Only primary channel has RXBUF connected to it. Sub-channels - * just share this RXBUF. - */ - error = vmbus_chan_gpadl_connect(sc->hn_prichan, - sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl); - if (error) { - if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n", - error); - goto cleanup; - } - - /* - * Connect RXBUF to NVS. 
- */ - - xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn)); - if (xact == NULL) { - if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n"); - error = ENXIO; - goto cleanup; - } - conn = vmbus_xact_req_data(xact); - conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN; - conn->nvs_gpadl = sc->hn_rxbuf_gpadl; - conn->nvs_sig = HN_NVS_RXBUF_SIG; - - resp_len = sizeof(*resp); - resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len, - HN_NVS_TYPE_RXBUF_CONNRESP); - if (resp == NULL) { - if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n"); - error = EIO; - goto cleanup; - } - - status = resp->nvs_status; - vmbus_xact_put(xact); - xact = NULL; - - if (status != HN_NVS_STATUS_OK) { - if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status); - error = EIO; - goto cleanup; - } - sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED; - - return (0); - -cleanup: - if (xact != NULL) - vmbus_xact_put(xact); - hn_nvs_disconn_rxbuf(sc); - return (error); -} - -static int -hn_nvs_conn_chim(struct hn_softc *sc) -{ - struct vmbus_xact *xact = NULL; - struct hn_nvs_chim_conn *chim; - const struct hn_nvs_chim_connresp *resp; - size_t resp_len; - uint32_t status, sectsz; - int error; - - /* - * Connect chimney sending buffer GPADL to the primary channel. - * - * NOTE: - * Only primary channel has chimney sending buffer connected to it. - * Sub-channels just share this chimney sending buffer. - */ - error = vmbus_chan_gpadl_connect(sc->hn_prichan, - sc->hn_chim_dma.hv_paddr, NETVSC_SEND_BUFFER_SIZE, - &sc->hn_chim_gpadl); - if (error) { - if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error); - goto cleanup; - } - - /* - * Connect chimney sending buffer to NVS - */ - - xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim)); - if (xact == NULL) { - if_printf(sc->hn_ifp, "no xact for nvs chim conn\n"); - error = ENXIO; - goto cleanup; - } - chim = vmbus_xact_req_data(xact); - chim->nvs_type = HN_NVS_TYPE_CHIM_CONN; - chim->nvs_gpadl = sc->hn_chim_gpadl; - chim->nvs_sig = HN_NVS_CHIM_SIG; - - resp_len = sizeof(*resp); - resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len, - HN_NVS_TYPE_CHIM_CONNRESP); - if (resp == NULL) { - if_printf(sc->hn_ifp, "exec nvs chim conn failed\n"); - error = EIO; - goto cleanup; - } - - status = resp->nvs_status; - sectsz = resp->nvs_sectsz; - vmbus_xact_put(xact); - xact = NULL; - - if (status != HN_NVS_STATUS_OK) { - if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status); - error = EIO; - goto cleanup; - } - if (sectsz == 0) { - if_printf(sc->hn_ifp, "zero chimney sending buffer " - "section size\n"); - return (0); - } - - sc->hn_chim_szmax = sectsz; - sc->hn_chim_cnt = NETVSC_SEND_BUFFER_SIZE / sc->hn_chim_szmax; - if (NETVSC_SEND_BUFFER_SIZE % sc->hn_chim_szmax != 0) { - if_printf(sc->hn_ifp, "chimney sending sections are " - "not properly aligned\n"); - } - if (sc->hn_chim_cnt % LONG_BIT != 0) { - if_printf(sc->hn_ifp, "discard %d chimney sending sections\n", - sc->hn_chim_cnt % LONG_BIT); - } - - sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT; - sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long), - M_NETVSC, M_WAITOK | M_ZERO); - - /* Done! 
*/ - sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; - if (bootverbose) { - if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n", - sc->hn_chim_szmax, sc->hn_chim_cnt); - } - return (0); - -cleanup: - if (xact != NULL) - vmbus_xact_put(xact); - hn_nvs_disconn_chim(sc); - return (error); -} - -static int -hn_nvs_disconn_rxbuf(struct hn_softc *sc) -{ - int error; - - if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) { - struct hn_nvs_rxbuf_disconn disconn; - - /* - * Disconnect RXBUF from NVS. - */ - memset(&disconn, 0, sizeof(disconn)); - disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN; - disconn.nvs_sig = HN_NVS_RXBUF_SIG; - - /* NOTE: No response. */ - error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); - if (error) { - if_printf(sc->hn_ifp, - "send nvs rxbuf disconn failed: %d\n", error); - return (error); - } - sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED; - - /* - * Wait for the hypervisor to receive this NVS request. - */ - while (!vmbus_chan_tx_empty(sc->hn_prichan)) - pause("waittx", 1); - /* - * Linger long enough for NVS to disconnect RXBUF. - */ - pause("lingtx", (200 * hz) / 1000); - } - - if (sc->hn_rxbuf_gpadl != 0) { - /* - * Disconnect RXBUF from primary channel. - */ - error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, - sc->hn_rxbuf_gpadl); - if (error) { - if_printf(sc->hn_ifp, - "rxbuf gpadl disconn failed: %d\n", error); - return (error); - } - sc->hn_rxbuf_gpadl = 0; - } - return (0); -} - -static int -hn_nvs_disconn_chim(struct hn_softc *sc) -{ - int error; - - if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) { - struct hn_nvs_chim_disconn disconn; - - /* - * Disconnect chimney sending buffer from NVS. - */ - memset(&disconn, 0, sizeof(disconn)); - disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN; - disconn.nvs_sig = HN_NVS_CHIM_SIG; - - /* NOTE: No response. */ - error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); - if (error) { - if_printf(sc->hn_ifp, - "send nvs chim disconn failed: %d\n", error); - return (error); - } - sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED; - - /* - * Wait for the hypervisor to receive this NVS request. - */ - while (!vmbus_chan_tx_empty(sc->hn_prichan)) - pause("waittx", 1); - /* - * Linger long enough for NVS to disconnect chimney - * sending buffer. - */ - pause("lingtx", (200 * hz) / 1000); - } - - if (sc->hn_chim_gpadl != 0) { - /* - * Disconnect chimney sending buffer from primary channel. 
- */ - error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, - sc->hn_chim_gpadl); - if (error) { - if_printf(sc->hn_ifp, - "chim gpadl disconn failed: %d\n", error); - return (error); - } - sc->hn_chim_gpadl = 0; - } - - if (sc->hn_chim_bmap != NULL) { - free(sc->hn_chim_bmap, M_NETVSC); - sc->hn_chim_bmap = NULL; - } - return (0); -} - -static int -hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver) -{ - struct vmbus_xact *xact; - struct hn_nvs_init *init; - const struct hn_nvs_init_resp *resp; - size_t resp_len; - uint32_t status; - - xact = vmbus_xact_get(sc->hn_xact, sizeof(*init)); - if (xact == NULL) { - if_printf(sc->hn_ifp, "no xact for nvs init\n"); - return (ENXIO); - } - init = vmbus_xact_req_data(xact); - init->nvs_type = HN_NVS_TYPE_INIT; - init->nvs_ver_min = nvs_ver; - init->nvs_ver_max = nvs_ver; - - resp_len = sizeof(*resp); - resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len, - HN_NVS_TYPE_INIT_RESP); - if (resp == NULL) { - if_printf(sc->hn_ifp, "exec init failed\n"); - vmbus_xact_put(xact); - return (EIO); - } - - status = resp->nvs_status; - vmbus_xact_put(xact); - - if (status != HN_NVS_STATUS_OK) { - if (bootverbose) { - /* - * Caller may try another NVS version, and will log - * error if there are no more NVS versions to try, - * so don't bark out loud here. - */ - if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n", - nvs_ver); - } - return (EINVAL); - } - return (0); -} - -/* - * Configure MTU and enable VLAN. - */ -static int -hn_nvs_conf_ndis(struct hn_softc *sc, int mtu) -{ - struct hn_nvs_ndis_conf conf; - int error; - - memset(&conf, 0, sizeof(conf)); - conf.nvs_type = HN_NVS_TYPE_NDIS_CONF; - conf.nvs_mtu = mtu; - conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN; - - /* NOTE: No response. */ - error = hn_nvs_req_send(sc, &conf, sizeof(conf)); - if (error) { - if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error); - return (error); - } - - if (bootverbose) - if_printf(sc->hn_ifp, "nvs ndis conf done\n"); - sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN; - return (0); -} - -static int -hn_nvs_init_ndis(struct hn_softc *sc) -{ - struct hn_nvs_ndis_init ndis; - int error; - - memset(&ndis, 0, sizeof(ndis)); - ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT; - ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver); - ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver); - - /* NOTE: No response. */ - error = hn_nvs_req_send(sc, &ndis, sizeof(ndis)); - if (error) - if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error); - return (error); -} - -static int -hn_nvs_init(struct hn_softc *sc) -{ - int i, error; - - if (device_is_attached(sc->hn_dev)) { - /* - * NVS version and NDIS version MUST NOT be changed. - */ - if (bootverbose) { - if_printf(sc->hn_ifp, "reinit NVS version 0x%x, " - "NDIS version %u.%u\n", sc->hn_nvs_ver, - HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), - HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); - } - - error = hn_nvs_doinit(sc, sc->hn_nvs_ver); - if (error) { - if_printf(sc->hn_ifp, "reinit NVS version 0x%x " - "failed: %d\n", sc->hn_nvs_ver, error); - return (error); - } - goto done; - } - - /* - * Find the supported NVS version and set NDIS version accordingly. - */ - for (i = 0; i < nitems(hn_nvs_version); ++i) { - error = hn_nvs_doinit(sc, hn_nvs_version[i]); - if (!error) { - sc->hn_nvs_ver = hn_nvs_version[i]; - - /* Set NDIS version according to NVS version. 
*/ - sc->hn_ndis_ver = HN_NDIS_VERSION_6_30; - if (sc->hn_nvs_ver <= HN_NVS_VERSION_4) - sc->hn_ndis_ver = HN_NDIS_VERSION_6_1; - - if (bootverbose) { - if_printf(sc->hn_ifp, "NVS version 0x%x, " - "NDIS version %u.%u\n", sc->hn_nvs_ver, - HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), - HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); - } - goto done; - } - } - if_printf(sc->hn_ifp, "no NVS available\n"); - return (ENXIO); - -done: - if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) - sc->hn_caps |= HN_CAP_HASHVAL; - return (0); -} - -int -hn_nvs_attach(struct hn_softc *sc, int mtu) -{ - int error; - - /* - * Initialize NVS. - */ - error = hn_nvs_init(sc); - if (error) - return (error); - - if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) { - /* - * Configure NDIS before initializing it. - */ - error = hn_nvs_conf_ndis(sc, mtu); - if (error) - return (error); - } - - /* - * Initialize NDIS. - */ - error = hn_nvs_init_ndis(sc); - if (error) - return (error); - - /* - * Connect RXBUF. - */ - error = hn_nvs_conn_rxbuf(sc); - if (error) - return (error); - - /* - * Connect chimney sending buffer. - */ - error = hn_nvs_conn_chim(sc); - if (error) - return (error); - return (0); -} - -void -hn_nvs_detach(struct hn_softc *sc) -{ - - /* NOTE: there are no requests to stop the NVS. */ - hn_nvs_disconn_rxbuf(sc); - hn_nvs_disconn_chim(sc); -} - -void -hn_nvs_sent_xact(struct hn_send_ctx *sndc, - struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, - const void *data, int dlen) -{ - - vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen); -} - -static void -hn_nvs_sent_none(struct hn_send_ctx *sndc __unused, - struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, - const void *data __unused, int dlen __unused) -{ - /* EMPTY */ -} - -void -hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) -{ - u_long mask; - uint32_t idx; - - idx = chim_idx / LONG_BIT; - KASSERT(idx < sc->hn_chim_bmap_cnt, - ("invalid chimney index 0x%x", chim_idx)); - - mask = 1UL << (chim_idx % LONG_BIT); - KASSERT(sc->hn_chim_bmap[idx] & mask, - ("index bitmap 0x%lx, chimney index %u, " - "bitmap idx %d, bitmask 0x%lx", - sc->hn_chim_bmap[idx], chim_idx, idx, mask)); - - atomic_clear_long(&sc->hn_chim_bmap[idx], mask); -} - -int -hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0) -{ - struct vmbus_xact *xact; - struct hn_nvs_subch_req *req; - const struct hn_nvs_subch_resp *resp; - int error, nsubch_req; - uint32_t nsubch; - size_t resp_len; - - nsubch_req = *nsubch0; - KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req)); - - xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); - if (xact == NULL) { - if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n"); - return (ENXIO); - } - req = vmbus_xact_req_data(xact); - req->nvs_type = HN_NVS_TYPE_SUBCH_REQ; - req->nvs_op = HN_NVS_SUBCH_OP_ALLOC; - req->nvs_nsubch = nsubch_req; - - resp_len = sizeof(*resp); - resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len, - HN_NVS_TYPE_SUBCH_RESP); - if (resp == NULL) { - if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n"); - error = EIO; - goto done; - } - if (resp->nvs_status != HN_NVS_STATUS_OK) { - if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n", - resp->nvs_status); - error = EIO; - goto done; - } - - nsubch = resp->nvs_nsubch; - if (nsubch > nsubch_req) { - if_printf(sc->hn_ifp, "%u subchans are allocated, " - "requested %d\n", nsubch, nsubch_req); - nsubch = nsubch_req; - } - *nsubch0 = nsubch; - error = 0; -done: - vmbus_xact_put(xact); - return (error); -} Property changes on: 
user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.c ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.h (nonexistent) @@ -1,282 +0,0 @@ -/*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. - * Copyright (c) 2010-2012 Citrix Inc. - * Copyright (c) 2012 NetApp Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/* - * HyperV vmbus (virtual machine bus) network VSC (virtual services client) - * header file - * - * (Updated from unencumbered NvspProtocol.h) - */ - -#ifndef __HV_NET_VSC_H__ -#define __HV_NET_VSC_H__ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include - -#define HN_USE_TXDESC_BUFRING - -MALLOC_DECLARE(M_NETVSC); - -/* - * The following arguably belongs in a separate header file - */ - -/* - * Defines - */ - -#define NETVSC_SEND_BUFFER_SIZE (1024*1024*15) /* 15M */ - -#define NETVSC_RECEIVE_BUFFER_SIZE_LEGACY (1024*1024*15) /* 15MB */ -#define NETVSC_RECEIVE_BUFFER_SIZE (1024*1024*16) /* 16MB */ - -/* - * Maximum MTU we permit to be configured for a netvsc interface. - * When the code was developed, a max MTU of 12232 was tested and - * proven to work. 9K is a reasonable maximum for an Ethernet. 
- */ -#define NETVSC_MAX_CONFIGURABLE_MTU (9 * 1024) - -#define NETVSC_PACKET_SIZE PAGE_SIZE - -/* - * Data types - */ - -struct vmbus_channel; - -#define NETVSC_DEVICE_RING_BUFFER_SIZE (128 * PAGE_SIZE) -#define NETVSC_PACKET_MAXPAGE 32 - -#define HN_XACT_REQ_PGCNT 2 -#define HN_XACT_RESP_PGCNT 2 -#define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE) -#define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE) - -struct hn_txdesc; -#ifndef HN_USE_TXDESC_BUFRING -SLIST_HEAD(hn_txdesc_list, hn_txdesc); -#else -struct buf_ring; -#endif - -struct hn_tx_ring; - -struct hn_rx_ring { - struct ifnet *hn_ifp; - struct hn_tx_ring *hn_txr; - void *hn_rdbuf; - uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */ - int hn_rx_idx; - - /* Trust csum verification on host side */ - int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ - struct lro_ctrl hn_lro; - - u_long hn_csum_ip; - u_long hn_csum_tcp; - u_long hn_csum_udp; - u_long hn_csum_trusted; - u_long hn_lro_tried; - u_long hn_small_pkts; - u_long hn_pkts; - u_long hn_rss_pkts; - - /* Rarely used stuffs */ - struct sysctl_oid *hn_rx_sysctl_tree; - int hn_rx_flags; - - void *hn_br; /* TX/RX bufring */ - struct hyperv_dma hn_br_dma; -} __aligned(CACHE_LINE_SIZE); - -#define HN_TRUST_HCSUM_IP 0x0001 -#define HN_TRUST_HCSUM_TCP 0x0002 -#define HN_TRUST_HCSUM_UDP 0x0004 - -#define HN_RX_FLAG_ATTACHED 0x1 - -struct hn_tx_ring { -#ifndef HN_USE_TXDESC_BUFRING - struct mtx hn_txlist_spin; - struct hn_txdesc_list hn_txlist; -#else - struct buf_ring *hn_txdesc_br; -#endif - int hn_txdesc_cnt; - int hn_txdesc_avail; - u_short hn_has_txeof; - u_short hn_txdone_cnt; - - int hn_sched_tx; - void (*hn_txeof)(struct hn_tx_ring *); - struct taskqueue *hn_tx_taskq; - struct task hn_tx_task; - struct task hn_txeof_task; - - struct buf_ring *hn_mbuf_br; - int hn_oactive; - int hn_tx_idx; - int hn_tx_flags; - - struct mtx hn_tx_lock; - struct hn_softc *hn_sc; - struct vmbus_channel *hn_chan; - - int hn_direct_tx_size; - int hn_chim_size; - bus_dma_tag_t hn_tx_data_dtag; - uint64_t hn_csum_assist; - - int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *); - int hn_suspended; - int hn_gpa_cnt; - struct vmbus_gpa hn_gpa[NETVSC_PACKET_MAXPAGE]; - - u_long hn_no_txdescs; - u_long hn_send_failed; - u_long hn_txdma_failed; - u_long hn_tx_collapsed; - u_long hn_tx_chimney_tried; - u_long hn_tx_chimney; - u_long hn_pkts; - - /* Rarely used stuffs */ - struct hn_txdesc *hn_txdesc; - bus_dma_tag_t hn_tx_rndis_dtag; - struct sysctl_oid *hn_tx_sysctl_tree; -} __aligned(CACHE_LINE_SIZE); - -#define HN_TX_FLAG_ATTACHED 0x1 -#define HN_TX_FLAG_HASHVAL 0x2 /* support HASHVAL pktinfo */ - -/* - * Device-specific softc structure - */ -struct hn_softc { - struct ifnet *hn_ifp; - struct ifmedia hn_media; - device_t hn_dev; - int hn_if_flags; - struct sx hn_lock; - struct vmbus_channel *hn_prichan; - - int hn_rx_ring_cnt; - int hn_rx_ring_inuse; - struct hn_rx_ring *hn_rx_ring; - - int hn_tx_ring_cnt; - int hn_tx_ring_inuse; - struct hn_tx_ring *hn_tx_ring; - - uint8_t *hn_chim; - u_long *hn_chim_bmap; - int hn_chim_bmap_cnt; - int hn_chim_cnt; - int hn_chim_szmax; - - int hn_cpu; - struct taskqueue *hn_tx_taskq; - struct sysctl_oid *hn_tx_sysctl_tree; - struct sysctl_oid *hn_rx_sysctl_tree; - struct vmbus_xact_ctx *hn_xact; - uint32_t hn_nvs_ver; - uint32_t hn_rx_filter; - - struct taskqueue *hn_mgmt_taskq; - struct taskqueue *hn_mgmt_taskq0; - struct task hn_link_task; - struct task hn_netchg_init; - struct timeout_task hn_netchg_status; - uint32_t hn_link_flags; /* HN_LINK_FLAG_ */ - - uint32_t 
hn_caps; /* HN_CAP_ */ - uint32_t hn_flags; /* HN_FLAG_ */ - void *hn_rxbuf; - uint32_t hn_rxbuf_gpadl; - struct hyperv_dma hn_rxbuf_dma; - - uint32_t hn_chim_gpadl; - struct hyperv_dma hn_chim_dma; - - uint32_t hn_rndis_rid; - uint32_t hn_ndis_ver; - int hn_ndis_tso_szmax; - int hn_ndis_tso_sgmin; - - struct ndis_rssprm_toeplitz hn_rss; -}; - -#define HN_FLAG_RXBUF_CONNECTED 0x0001 -#define HN_FLAG_CHIM_CONNECTED 0x0002 -#define HN_FLAG_HAS_RSSKEY 0x0004 -#define HN_FLAG_HAS_RSSIND 0x0008 -#define HN_FLAG_SYNTH_ATTACHED 0x0010 - -#define HN_CAP_VLAN 0x0001 -#define HN_CAP_MTU 0x0002 -#define HN_CAP_IPCS 0x0004 -#define HN_CAP_TCP4CS 0x0008 -#define HN_CAP_TCP6CS 0x0010 -#define HN_CAP_UDP4CS 0x0020 -#define HN_CAP_UDP6CS 0x0040 -#define HN_CAP_TSO4 0x0080 -#define HN_CAP_TSO6 0x0100 -#define HN_CAP_HASHVAL 0x0200 - -#define HN_LINK_FLAG_LINKUP 0x0001 -#define HN_LINK_FLAG_NETCHG 0x0002 - -#endif /* __HV_NET_VSC_H__ */ - Property changes on: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_net_vsc.h ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.c (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.c (revision 308054) @@ -0,0 +1,693 @@ +/*- + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Network Virtualization Service. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" +#include "opt_inet.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +static int hn_nvs_conn_chim(struct hn_softc *); +static int hn_nvs_conn_rxbuf(struct hn_softc *); +static int hn_nvs_disconn_chim(struct hn_softc *); +static int hn_nvs_disconn_rxbuf(struct hn_softc *); +static int hn_nvs_conf_ndis(struct hn_softc *, int); +static int hn_nvs_init_ndis(struct hn_softc *); +static int hn_nvs_doinit(struct hn_softc *, uint32_t); +static int hn_nvs_init(struct hn_softc *); +static const void *hn_nvs_xact_execute(struct hn_softc *, + struct vmbus_xact *, void *, int, + size_t *, uint32_t); +static void hn_nvs_sent_none(struct hn_nvs_sendctx *, + struct hn_softc *, struct vmbus_channel *, + const void *, int); + +struct hn_nvs_sendctx hn_nvs_sendctx_none = + HN_NVS_SENDCTX_INITIALIZER(hn_nvs_sent_none, NULL); + +static const uint32_t hn_nvs_version[] = { + HN_NVS_VERSION_5, + HN_NVS_VERSION_4, + HN_NVS_VERSION_2, + HN_NVS_VERSION_1 +}; + +static const void * +hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, + void *req, int reqlen, size_t *resplen0, uint32_t type) +{ + struct hn_nvs_sendctx sndc; + size_t resplen, min_resplen = *resplen0; + const struct hn_nvs_hdr *hdr; + int error; + + KASSERT(min_resplen >= sizeof(*hdr), + ("invalid minimum response len %zu", min_resplen)); + + /* + * Execute the xact setup by the caller. + */ + hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); + + vmbus_xact_activate(xact); + error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, + req, reqlen, &sndc); + if (error) { + vmbus_xact_deactivate(xact); + return (NULL); + } + hdr = vmbus_xact_wait(xact, &resplen); + + /* + * Check this NVS response message. + */ + if (resplen < min_resplen) { + if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen); + return (NULL); + } + if (hdr->nvs_type != type) { + if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, " + "expect 0x%08x\n", hdr->nvs_type, type); + return (NULL); + } + /* All pass! */ + *resplen0 = resplen; + return (hdr); +} + +static __inline int +hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen) +{ + + return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE, + req, reqlen, &hn_nvs_sendctx_none)); +} + +static int +hn_nvs_conn_rxbuf(struct hn_softc *sc) +{ + struct vmbus_xact *xact = NULL; + struct hn_nvs_rxbuf_conn *conn; + const struct hn_nvs_rxbuf_connresp *resp; + size_t resp_len; + uint32_t status; + int error, rxbuf_size; + + /* + * Limit RXBUF size for old NVS. + */ + if (sc->hn_nvs_ver <= HN_NVS_VERSION_2) + rxbuf_size = HN_RXBUF_SIZE_COMPAT; + else + rxbuf_size = HN_RXBUF_SIZE; + + /* + * Connect the RXBUF GPADL to the primary channel. + * + * NOTE: + * Only primary channel has RXBUF connected to it. Sub-channels + * just share this RXBUF. + */ + error = vmbus_chan_gpadl_connect(sc->hn_prichan, + sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl); + if (error) { + if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n", + error); + goto cleanup; + } + + /* + * Connect RXBUF to NVS. 
+ */ + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n"); + error = ENXIO; + goto cleanup; + } + conn = vmbus_xact_req_data(xact); + conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN; + conn->nvs_gpadl = sc->hn_rxbuf_gpadl; + conn->nvs_sig = HN_NVS_RXBUF_SIG; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len, + HN_NVS_TYPE_RXBUF_CONNRESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n"); + error = EIO; + goto cleanup; + } + + status = resp->nvs_status; + vmbus_xact_put(xact); + xact = NULL; + + if (status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status); + error = EIO; + goto cleanup; + } + sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED; + + return (0); + +cleanup: + if (xact != NULL) + vmbus_xact_put(xact); + hn_nvs_disconn_rxbuf(sc); + return (error); +} + +static int +hn_nvs_conn_chim(struct hn_softc *sc) +{ + struct vmbus_xact *xact = NULL; + struct hn_nvs_chim_conn *chim; + const struct hn_nvs_chim_connresp *resp; + size_t resp_len; + uint32_t status, sectsz; + int error; + + /* + * Connect chimney sending buffer GPADL to the primary channel. + * + * NOTE: + * Only primary channel has chimney sending buffer connected to it. + * Sub-channels just share this chimney sending buffer. + */ + error = vmbus_chan_gpadl_connect(sc->hn_prichan, + sc->hn_chim_dma.hv_paddr, HN_CHIM_SIZE, &sc->hn_chim_gpadl); + if (error) { + if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error); + goto cleanup; + } + + /* + * Connect chimney sending buffer to NVS + */ + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs chim conn\n"); + error = ENXIO; + goto cleanup; + } + chim = vmbus_xact_req_data(xact); + chim->nvs_type = HN_NVS_TYPE_CHIM_CONN; + chim->nvs_gpadl = sc->hn_chim_gpadl; + chim->nvs_sig = HN_NVS_CHIM_SIG; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len, + HN_NVS_TYPE_CHIM_CONNRESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs chim conn failed\n"); + error = EIO; + goto cleanup; + } + + status = resp->nvs_status; + sectsz = resp->nvs_sectsz; + vmbus_xact_put(xact); + xact = NULL; + + if (status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status); + error = EIO; + goto cleanup; + } + if (sectsz == 0) { + if_printf(sc->hn_ifp, "zero chimney sending buffer " + "section size\n"); + return (0); + } + + sc->hn_chim_szmax = sectsz; + sc->hn_chim_cnt = HN_CHIM_SIZE / sc->hn_chim_szmax; + if (HN_CHIM_SIZE % sc->hn_chim_szmax != 0) { + if_printf(sc->hn_ifp, "chimney sending sections are " + "not properly aligned\n"); + } + if (sc->hn_chim_cnt % LONG_BIT != 0) { + if_printf(sc->hn_ifp, "discard %d chimney sending sections\n", + sc->hn_chim_cnt % LONG_BIT); + } + + sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT; + sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long), + M_DEVBUF, M_WAITOK | M_ZERO); + + /* Done! 
*/ + sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; + if (bootverbose) { + if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n", + sc->hn_chim_szmax, sc->hn_chim_cnt); + } + return (0); + +cleanup: + if (xact != NULL) + vmbus_xact_put(xact); + hn_nvs_disconn_chim(sc); + return (error); +} + +static int +hn_nvs_disconn_rxbuf(struct hn_softc *sc) +{ + int error; + + if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) { + struct hn_nvs_rxbuf_disconn disconn; + + /* + * Disconnect RXBUF from NVS. + */ + memset(&disconn, 0, sizeof(disconn)); + disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN; + disconn.nvs_sig = HN_NVS_RXBUF_SIG; + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); + if (error) { + if_printf(sc->hn_ifp, + "send nvs rxbuf disconn failed: %d\n", error); + return (error); + } + sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED; + + /* + * Wait for the hypervisor to receive this NVS request. + */ + while (!vmbus_chan_tx_empty(sc->hn_prichan)) + pause("waittx", 1); + /* + * Linger long enough for NVS to disconnect RXBUF. + */ + pause("lingtx", (200 * hz) / 1000); + } + + if (sc->hn_rxbuf_gpadl != 0) { + /* + * Disconnect RXBUF from primary channel. + */ + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_rxbuf_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "rxbuf gpadl disconn failed: %d\n", error); + return (error); + } + sc->hn_rxbuf_gpadl = 0; + } + return (0); +} + +static int +hn_nvs_disconn_chim(struct hn_softc *sc) +{ + int error; + + if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) { + struct hn_nvs_chim_disconn disconn; + + /* + * Disconnect chimney sending buffer from NVS. + */ + memset(&disconn, 0, sizeof(disconn)); + disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN; + disconn.nvs_sig = HN_NVS_CHIM_SIG; + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); + if (error) { + if_printf(sc->hn_ifp, + "send nvs chim disconn failed: %d\n", error); + return (error); + } + sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED; + + /* + * Wait for the hypervisor to receive this NVS request. + */ + while (!vmbus_chan_tx_empty(sc->hn_prichan)) + pause("waittx", 1); + /* + * Linger long enough for NVS to disconnect chimney + * sending buffer. + */ + pause("lingtx", (200 * hz) / 1000); + } + + if (sc->hn_chim_gpadl != 0) { + /* + * Disconnect chimney sending buffer from primary channel. 
+ */ + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_chim_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "chim gpadl disconn failed: %d\n", error); + return (error); + } + sc->hn_chim_gpadl = 0; + } + + if (sc->hn_chim_bmap != NULL) { + free(sc->hn_chim_bmap, M_DEVBUF); + sc->hn_chim_bmap = NULL; + } + return (0); +} + +static int +hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver) +{ + struct vmbus_xact *xact; + struct hn_nvs_init *init; + const struct hn_nvs_init_resp *resp; + size_t resp_len; + uint32_t status; + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*init)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs init\n"); + return (ENXIO); + } + init = vmbus_xact_req_data(xact); + init->nvs_type = HN_NVS_TYPE_INIT; + init->nvs_ver_min = nvs_ver; + init->nvs_ver_max = nvs_ver; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len, + HN_NVS_TYPE_INIT_RESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec init failed\n"); + vmbus_xact_put(xact); + return (EIO); + } + + status = resp->nvs_status; + vmbus_xact_put(xact); + + if (status != HN_NVS_STATUS_OK) { + if (bootverbose) { + /* + * Caller may try another NVS version, and will log + * error if there are no more NVS versions to try, + * so don't bark out loud here. + */ + if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n", + nvs_ver); + } + return (EINVAL); + } + return (0); +} + +/* + * Configure MTU and enable VLAN. + */ +static int +hn_nvs_conf_ndis(struct hn_softc *sc, int mtu) +{ + struct hn_nvs_ndis_conf conf; + int error; + + memset(&conf, 0, sizeof(conf)); + conf.nvs_type = HN_NVS_TYPE_NDIS_CONF; + conf.nvs_mtu = mtu; + conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN; + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &conf, sizeof(conf)); + if (error) { + if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error); + return (error); + } + + if (bootverbose) + if_printf(sc->hn_ifp, "nvs ndis conf done\n"); + sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN; + return (0); +} + +static int +hn_nvs_init_ndis(struct hn_softc *sc) +{ + struct hn_nvs_ndis_init ndis; + int error; + + memset(&ndis, 0, sizeof(ndis)); + ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT; + ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver); + ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver); + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &ndis, sizeof(ndis)); + if (error) + if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error); + return (error); +} + +static int +hn_nvs_init(struct hn_softc *sc) +{ + int i, error; + + if (device_is_attached(sc->hn_dev)) { + /* + * NVS version and NDIS version MUST NOT be changed. + */ + if (bootverbose) { + if_printf(sc->hn_ifp, "reinit NVS version 0x%x, " + "NDIS version %u.%u\n", sc->hn_nvs_ver, + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); + } + + error = hn_nvs_doinit(sc, sc->hn_nvs_ver); + if (error) { + if_printf(sc->hn_ifp, "reinit NVS version 0x%x " + "failed: %d\n", sc->hn_nvs_ver, error); + return (error); + } + goto done; + } + + /* + * Find the supported NVS version and set NDIS version accordingly. + */ + for (i = 0; i < nitems(hn_nvs_version); ++i) { + error = hn_nvs_doinit(sc, hn_nvs_version[i]); + if (!error) { + sc->hn_nvs_ver = hn_nvs_version[i]; + + /* Set NDIS version according to NVS version. 
*/ + sc->hn_ndis_ver = HN_NDIS_VERSION_6_30; + if (sc->hn_nvs_ver <= HN_NVS_VERSION_4) + sc->hn_ndis_ver = HN_NDIS_VERSION_6_1; + + if (bootverbose) { + if_printf(sc->hn_ifp, "NVS version 0x%x, " + "NDIS version %u.%u\n", sc->hn_nvs_ver, + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); + } + goto done; + } + } + if_printf(sc->hn_ifp, "no NVS available\n"); + return (ENXIO); + +done: + if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) + sc->hn_caps |= HN_CAP_HASHVAL; + return (0); +} + +int +hn_nvs_attach(struct hn_softc *sc, int mtu) +{ + int error; + + /* + * Initialize NVS. + */ + error = hn_nvs_init(sc); + if (error) + return (error); + + if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) { + /* + * Configure NDIS before initializing it. + */ + error = hn_nvs_conf_ndis(sc, mtu); + if (error) + return (error); + } + + /* + * Initialize NDIS. + */ + error = hn_nvs_init_ndis(sc); + if (error) + return (error); + + /* + * Connect RXBUF. + */ + error = hn_nvs_conn_rxbuf(sc); + if (error) + return (error); + + /* + * Connect chimney sending buffer. + */ + error = hn_nvs_conn_chim(sc); + if (error) + return (error); + return (0); +} + +void +hn_nvs_detach(struct hn_softc *sc) +{ + + /* NOTE: there are no requests to stop the NVS. */ + hn_nvs_disconn_rxbuf(sc); + hn_nvs_disconn_chim(sc); +} + +void +hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, + struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, + const void *data, int dlen) +{ + + vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen); +} + +static void +hn_nvs_sent_none(struct hn_nvs_sendctx *sndc __unused, + struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, + const void *data __unused, int dlen __unused) +{ + /* EMPTY */ +} + +int +hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0) +{ + struct vmbus_xact *xact; + struct hn_nvs_subch_req *req; + const struct hn_nvs_subch_resp *resp; + int error, nsubch_req; + uint32_t nsubch; + size_t resp_len; + + nsubch_req = *nsubch0; + KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req)); + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n"); + return (ENXIO); + } + req = vmbus_xact_req_data(xact); + req->nvs_type = HN_NVS_TYPE_SUBCH_REQ; + req->nvs_op = HN_NVS_SUBCH_OP_ALLOC; + req->nvs_nsubch = nsubch_req; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len, + HN_NVS_TYPE_SUBCH_RESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n"); + error = EIO; + goto done; + } + if (resp->nvs_status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n", + resp->nvs_status); + error = EIO; + goto done; + } + + nsubch = resp->nvs_nsubch; + if (nsubch > nsubch_req) { + if_printf(sc->hn_ifp, "%u subchans are allocated, " + "requested %d\n", nsubch, nsubch_req); + nsubch = nsubch_req; + } + *nsubch0 = nsubch; + error = 0; +done: + vmbus_xact_put(xact); + return (error); +} + +int +hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, + struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) +{ + + return hn_nvs_send_rndis_sglist(chan, HN_NVS_RNDIS_MTYPE_CTRL, + sndc, gpa, gpa_cnt); +} Property changes on: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property 
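[Editor's illustration, not part of the patch: hn_nvs_init() above negotiates the NVS protocol by walking a preference-ordered version table, keeping the first version the host accepts, and then deriving the NDIS revision from it. The following standalone C sketch models only that pattern; nvs_try_version() and the SK_* constants are illustrative stand-ins, not driver API.]

/*
 * Minimal userland sketch of the negotiation loop in hn_nvs_init():
 * try versions newest-to-oldest, stop at the first one the "host"
 * accepts, then pick a companion (NDIS-style) revision from it.
 */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define SK_NVS_VERSION_5	0x50000		/* illustrative values */
#define SK_NVS_VERSION_4	0x40000
#define SK_NVS_VERSION_2	0x30002
#define SK_NVS_VERSION_1	0x00002

static const uint32_t sk_nvs_versions[] = {
	SK_NVS_VERSION_5, SK_NVS_VERSION_4, SK_NVS_VERSION_2, SK_NVS_VERSION_1
};

/* Stand-in for hn_nvs_doinit(): pretend the host only speaks NVS 4 and below. */
static int
nvs_try_version(uint32_t ver)
{
	return (ver <= SK_NVS_VERSION_4 ? 0 : EINVAL);
}

int
main(void)
{
	uint32_t nvs_ver = 0, ndis_ver;
	size_t i;

	for (i = 0; i < sizeof(sk_nvs_versions) / sizeof(sk_nvs_versions[0]); ++i) {
		if (nvs_try_version(sk_nvs_versions[i]) == 0) {
			nvs_ver = sk_nvs_versions[i];
			break;
		}
	}
	if (nvs_ver == 0) {
		fprintf(stderr, "no NVS version accepted\n");
		return (1);
	}
	/* Older NVS pairs with the older NDIS revision, newer NVS with the newer one. */
	ndis_ver = (nvs_ver <= SK_NVS_VERSION_4) ? 0x00060001 : 0x0006001e;
	printf("negotiated NVS 0x%x, NDIS 0x%x\n",
	    (unsigned)nvs_ver, (unsigned)ndis_ver);
	return (0);
}

[The same walk-the-table shape covers any feature that must not change across reinit: the driver records the accepted version in the softc and, on reattach, replays exactly that version instead of renegotiating.]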
Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.h (revision 308054) @@ -0,0 +1,106 @@ +/*- + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HN_NVS_H_ +#define _HN_NVS_H_ + +struct hn_nvs_sendctx; +struct vmbus_channel; +struct hn_softc; + +typedef void (*hn_nvs_sent_t) + (struct hn_nvs_sendctx *, struct hn_softc *, + struct vmbus_channel *, const void *, int); + +struct hn_nvs_sendctx { + hn_nvs_sent_t hn_cb; + void *hn_cbarg; +}; + +#define HN_NVS_SENDCTX_INITIALIZER(cb, cbarg) \ +{ \ + .hn_cb = cb, \ + .hn_cbarg = cbarg \ +} + +static __inline void +hn_nvs_sendctx_init(struct hn_nvs_sendctx *sndc, hn_nvs_sent_t cb, void *cbarg) +{ + + sndc->hn_cb = cb; + sndc->hn_cbarg = cbarg; +} + +static __inline int +hn_nvs_send(struct vmbus_channel *chan, uint16_t flags, + void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc) +{ + + return (vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, flags, + nvs_msg, nvs_msglen, (uint64_t)(uintptr_t)sndc)); +} + +static __inline int +hn_nvs_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, + void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc) +{ + + return (vmbus_chan_send_sglist(chan, sg, sglen, nvs_msg, nvs_msglen, + (uint64_t)(uintptr_t)sndc)); +} + +static __inline int +hn_nvs_send_rndis_sglist(struct vmbus_channel *chan, uint32_t rndis_mtype, + struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) +{ + struct hn_nvs_rndis rndis; + + rndis.nvs_type = HN_NVS_TYPE_RNDIS; + rndis.nvs_rndis_mtype = rndis_mtype; + rndis.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID; + rndis.nvs_chim_sz = 0; + + return (hn_nvs_send_sglist(chan, gpa, gpa_cnt, + &rndis, sizeof(rndis), sndc)); +} + +int hn_nvs_attach(struct hn_softc *sc, int mtu); +void hn_nvs_detach(struct hn_softc *sc); +int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch); +void hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, + struct hn_softc *sc, struct vmbus_channel *chan, + const void *data, int dlen); +int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, + struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, + int gpa_cnt); + +extern struct hn_nvs_sendctx hn_nvs_sendctx_none; + +#endif /* !_HN_NVS_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_nvs.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_rndis.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_rndis.h (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_rndis.h (revision 308054) @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HN_RNDIS_H_ +#define _HN_RNDIS_H_ + +struct hn_softc; + +int hn_rndis_attach(struct hn_softc *sc, int mtu); +void hn_rndis_detach(struct hn_softc *sc); +int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags); +int hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt); +int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr); +/* link_status: NDIS_MEDIA_STATE_ */ +int hn_rndis_get_linkstatus(struct hn_softc *sc, + uint32_t *link_status); +/* filter: NDIS_PACKET_TYPE_. */ +int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter); +void hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, + int dlen); + +#endif /* !_HN_RNDIS_H_ */ Property changes on: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hn_rndis.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 308054) @@ -1,4234 +1,4649 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include -#include -#include #include +#include +#include +#include +#include #include "vmbus_if.h" /* Short for Hyper-V network interface */ #define NETVSC_DEVNAME "hn" /* * It looks like offset 0 of buf is reserved to hold the softc pointer. * The sc pointer evidently not needed, and is not presently populated. * The packet offset is where the netvsc_packet starts in the buffer. 
*/ #define HV_NV_SC_PTR_OFFSET_IN_BUF 0 #define HV_NV_PACKET_OFFSET_IN_BUF 16 /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_LROENT_CNT_DEF 128 #define HN_RING_CNT_DEF_MAX 8 #define HN_RNDIS_PKT_LEN \ (sizeof(struct rndis_packet_msg) + \ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE /* -1 for RNDIS packet message */ -#define HN_TX_DATA_SEGCNT_MAX (NETVSC_PACKET_MAXPAGE - 1) +#define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 +#define HN_RXINFO_VLAN 0x0001 +#define HN_RXINFO_CSUM 0x0002 +#define HN_RXINFO_HASHINF 0x0004 +#define HN_RXINFO_HASHVAL 0x0008 +#define HN_RXINFO_ALL \ + (HN_RXINFO_VLAN | \ + HN_RXINFO_CSUM | \ + HN_RXINFO_HASHINF | \ + HN_RXINFO_HASHVAL) + struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ - struct hn_send_ctx send_ctx; + struct hn_nvs_sendctx send_ctx; uint32_t chim_index; int chim_size; bus_dmamap_t data_dmap; bus_addr_t rndis_pkt_paddr; struct rndis_packet_msg *rndis_pkt; bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x1 #define HN_TXD_FLAG_DMAMAP 0x2 +#define HN_NDIS_VLAN_INFO_INVALID 0xffffffff +#define HN_NDIS_RXCSUM_INFO_INVALID 0 +#define HN_NDIS_HASH_INFO_INVALID 0 + +struct hn_rxinfo { + uint32_t vlan_info; + uint32_t csum_info; + uint32_t hash_info; + uint32_t hash_value; +}; + #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) #define HN_LRO_ACKCNT_DEF 1 #define HN_LOCK_INIT(sc) \ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK(sc) sx_xlock(&(sc)->hn_lock) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) #define HN_CSUM_IP_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) /* * Globals */ SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. 
*/ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); /* Limit TSO burst size */ static int hn_tso_maxlen = IP_MAXPACKET; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); /* Limit chimney send size */ static int hn_tx_chimney_size = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, &hn_tx_chimney_size, 0, "Chimney send packet size limit"); /* Limit the size of packet for direct transmission */ static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, &hn_lro_entry_count, 0, "LRO entry count"); #endif #endif static int hn_share_tx_taskq = 0; SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN, &hn_share_tx_taskq, 0, "Enable shared TX taskqueue"); static struct taskqueue *hn_tx_taskq; #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; #else static int hn_use_txdesc_bufring = 1; #endif SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); static int hn_bind_tx_taskq = -1; SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN, &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu"); static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); static int hn_chan_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hn_chan_cnt, 0, "# of channels to use; each channel has one RX ring and one TX ring"); static int hn_tx_ring_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, &hn_tx_ring_cnt, 0, "# of TX rings to use"); static int hn_tx_swq_depth = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); #if __FreeBSD_version >= 1100095 static u_int hn_lro_mbufq_depth = 0; SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); #endif static u_int hn_cpu_index; /* * Forward declarations */ static void hn_stop(struct hn_softc *sc); static void hn_init_locked(struct hn_softc *sc); static void hn_init(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int hn_start_locked(struct hn_tx_ring *txr, int len); static void hn_start(struct ifnet *ifp); static void hn_start_txeof(struct hn_tx_ring *); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); static int 
hn_caps_sysctl(SYSCTL_HANDLER_ARGS); static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); static int hn_check_iplen(const struct mbuf *, int); static int hn_create_tx_ring(struct hn_softc *, int); static void hn_destroy_tx_ring(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_start_taskfunc(void *, int); static void hn_start_txeof_taskfunc(void *, int); static void hn_link_taskfunc(void *, int); static void hn_netchg_init_taskfunc(void *, int); static void hn_netchg_status_taskfunc(void *, int); static void hn_suspend_mgmt_taskfunc(void *, int); static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_create_rx_data(struct hn_softc *sc, int); static void hn_destroy_rx_data(struct hn_softc *sc); static void hn_set_chim_size(struct hn_softc *, int); static void hn_set_tso_maxsize(struct hn_softc *, int, int); static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); static int hn_attach_subchans(struct hn_softc *); static void hn_detach_allchans(struct hn_softc *); static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr); +static void hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *); static void hn_set_ring_inuse(struct hn_softc *, int); static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static bool hn_tx_ring_pending(struct hn_tx_ring *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_rx_drain(struct vmbus_channel *); static void hn_tx_resume(struct hn_softc *, int); static void hn_tx_ring_qflush(struct hn_tx_ring *); static int netvsc_detach(device_t dev); static void hn_link_status(struct hn_softc *); static int hn_sendpkt_rndis_sglist(struct hn_tx_ring *, struct hn_txdesc *); static int hn_sendpkt_rndis_chim(struct hn_tx_ring *, struct hn_txdesc *); static int hn_set_rxfilter(struct hn_softc *); +static void hn_link_status_update(struct hn_softc *); +static void hn_network_change(struct hn_softc *); +static int hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *); +static void hn_rndis_rx_data(struct hn_rx_ring *, const void *, int); +static void hn_rndis_rx_status(struct hn_softc *, const void *, int); + static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt); static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt); -static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr, +static void hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr); static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid); static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_taskfunc(void *, int); static 
void hn_xmit_txeof_taskfunc(void *, int); static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; #if __FreeBSD_version >= 1100099 static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } #endif -static __inline int -hn_nvs_send_rndis_sglist1(struct vmbus_channel *chan, uint32_t rndis_mtype, - struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) -{ - struct hn_nvs_rndis rndis; - - rndis.nvs_type = HN_NVS_TYPE_RNDIS; - rndis.nvs_rndis_mtype = rndis_mtype; - rndis.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID; - rndis.nvs_chim_sz = 0; - - return (hn_nvs_send_sglist(chan, gpa, gpa_cnt, - &rndis, sizeof(rndis), sndc)); -} - -int -hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, - struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) -{ - - return hn_nvs_send_rndis_sglist1(chan, HN_NVS_RNDIS_MTYPE_CTRL, - sndc, gpa, gpa_cnt); -} - static int hn_sendpkt_rndis_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0, ("invalid rndis sglist txd")); - return (hn_nvs_send_rndis_sglist1(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, + return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); } static int hn_sendpkt_rndis_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) { struct hn_nvs_rndis rndis; KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && txd->chim_size > 0, ("invalid rndis chim txd")); rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; rndis.nvs_chim_idx = txd->chim_index; rndis.nvs_chim_sz = txd->chim_size; return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, &rndis, sizeof(rndis), &txd->send_ctx)); } +static __inline uint32_t +hn_chim_alloc(struct hn_softc *sc) +{ + int i, bmap_cnt = sc->hn_chim_bmap_cnt; + u_long *bmap = sc->hn_chim_bmap; + uint32_t ret = HN_NVS_CHIM_IDX_INVALID; + + for (i = 0; i < bmap_cnt; ++i) { + int idx; + + idx = ffsl(~bmap[i]); + if (idx == 0) + continue; + + --idx; /* ffsl is 1-based */ + KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, + ("invalid i %d and idx %d", i, idx)); + + if (atomic_testandset_long(&bmap[i], idx)) + continue; + + ret = i * LONG_BIT + idx; + break; + } + return (ret); +} + +static __inline void +hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) +{ + u_long mask; + uint32_t idx; + + idx = chim_idx / LONG_BIT; + KASSERT(idx < sc->hn_chim_bmap_cnt, + ("invalid chimney index 0x%x", chim_idx)); + + mask = 1UL << (chim_idx % LONG_BIT); + KASSERT(sc->hn_chim_bmap[idx] & mask, + ("index bitmap 0x%lx, chimney index %u, " + "bitmap idx %d, bitmask 0x%lx", + sc->hn_chim_bmap[idx], chim_idx, idx, mask)); + + atomic_clear_long(&sc->hn_chim_bmap[idx], mask); +} + static int hn_set_rxfilter(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; uint32_t filter; int error = 0; HN_LOCK_ASSERT(sc); if (ifp->if_flags & IFF_PROMISC) { filter = NDIS_PACKET_TYPE_PROMISCUOUS; } else { filter = NDIS_PACKET_TYPE_DIRECTED; if (ifp->if_flags & IFF_BROADCAST) filter |= NDIS_PACKET_TYPE_BROADCAST; #ifdef notyet /* * See the comment in SIOCADDMULTI/SIOCDELMULTI. 
*/ /* TODO: support multicast list */ if ((ifp->if_flags & IFF_ALLMULTI) || !TAILQ_EMPTY(&ifp->if_multiaddrs)) filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; #else /* Always enable ALLMULTI */ filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; #endif } if (sc->hn_rx_filter != filter) { error = hn_rndis_set_rxfilter(sc, filter); if (!error) sc->hn_rx_filter = filter; } return (error); } static int hn_get_txswq_depth(const struct hn_tx_ring *txr) { KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); if (hn_tx_swq_depth < txr->hn_txdesc_cnt) return txr->hn_txdesc_cnt; return hn_tx_swq_depth; } static int hn_rss_reconfig(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return (ENXIO); /* * Disable RSS first. * * NOTE: * Direct reconfiguration by setting the UNCHG flags does * _not_ work properly. */ if (bootverbose) if_printf(sc->hn_ifp, "disable RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); if (error) { if_printf(sc->hn_ifp, "RSS disable failed\n"); return (error); } /* * Reenable the RSS w/ the updated RSS key or indirect * table. */ if (bootverbose) if_printf(sc->hn_ifp, "reconfig RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { if_printf(sc->hn_ifp, "RSS reconfig failed\n"); return (error); } return (0); } static void hn_rss_ind_fixup(struct hn_softc *sc, int nchan) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int i; KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); /* * Check indirect table to make sure that all channels in it * can be used. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) { if (rss->rss_ind[i] >= nchan) { if_printf(sc->hn_ifp, "RSS indirect table %d fixup: %u -> %d\n", i, rss->rss_ind[i], nchan - 1); rss->rss_ind[i] = nchan - 1; } } } static int hn_ifmedia_upd(struct ifnet *ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ static const struct hyperv_guid g_net_vsc_device_type = { .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} }; /* * Standard probe entry point. * */ static int netvsc_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &g_net_vsc_device_type) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } /* * Standard attach entry point. * * Called when the driver is loaded. It allocates needed resources, * and initializes the "hardware" and software. */ static int netvsc_attach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; uint8_t eaddr[ETHER_ADDR_LEN]; struct ifnet *ifp = NULL; int error, ring_cnt, tx_ring_cnt; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); HN_LOCK_INIT(sc); /* * Setup taskqueue for transmission. 
*/ if (hn_tx_taskq == NULL) { sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskq); if (hn_bind_tx_taskq >= 0) { int cpu = hn_bind_tx_taskq; cpuset_t cpu_set; if (cpu > mp_ncpus - 1) cpu = mp_ncpus - 1; CPU_SETOF(cpu, &cpu_set); taskqueue_start_threads_cpuset(&sc->hn_tx_taskq, 1, PI_NET, &cpu_set, "%s tx", device_get_nameunit(dev)); } else { taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx", device_get_nameunit(dev)); } } else { sc->hn_tx_taskq = hn_tx_taskq; } /* * Setup taskqueue for mangement tasks, e.g. link status. */ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", device_get_nameunit(dev)); TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, hn_netchg_status_taskfunc, sc); /* * Allocate ifnet and setup its name earlier, so that if_printf * can be used by functions, which will be called after * ether_ifattach(). */ ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Initialize ifmedia earlier so that it can be unconditionally * destroyed, if error happened later on. */ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * * NOTE: * The # of RX rings to use is same as the # of channels to use. */ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } /* * Set the leader CPU for channels. */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; /* * Create enough TX/RX rings, even if only limited number of * channels can be allocated. */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; error = hn_create_rx_data(sc, ring_cnt); if (error) goto failed; /* * Create transaction context for NVS and RNDIS transactions. */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); if (sc->hn_xact == NULL) goto failed; /* * Attach the synthetic parts, i.e. NVS and RNDIS. */ error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; error = hn_rndis_get_eaddr(sc, eaddr); if (error) goto failed; #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. */ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } #endif /* * Fixup TX stuffs after synthetic parts are attached. 
*/ hn_fixup_tx_data(sc); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, &sc->hn_nvs_ver, 0, "NVS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_ndis_version_sysctl, "A", "NDIS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_caps_sysctl, "A", "capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_hwassist_sysctl, "A", "hwassist"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxfilter_sysctl, "A", "rxfilter"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_rss_hash_sysctl, "A", "RSS hash"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", + CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_key_sysctl, "IU", "RSS key"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); /* * Setup the ifmedia, which has been initialized earlier. */ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Setup the ifnet for this interface. */ ifp->if_baudrate = IF_Gbps(10); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_init = hn_init; if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); ifp->if_start = hn_start; IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); ifp->if_snd.ifq_drv_maxlen = qdepth - 1; IFQ_SET_READY(&ifp->if_snd); } else { ifp->if_transmit = hn_transmit; ifp->if_qflush = hn_xmit_qflush; } ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; #endif if (sc->hn_caps & HN_CAP_VLAN) { /* XXX not sure about VLAN_MTU. */ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; } ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; if (ifp->if_hwassist & HN_CSUM_IP_MASK) ifp->if_capabilities |= IFCAP_TXCSUM; if (ifp->if_hwassist & HN_CSUM_IP6_MASK) ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; if (sc->hn_caps & HN_CAP_TSO4) { ifp->if_capabilities |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_IP_TSO; } if (sc->hn_caps & HN_CAP_TSO6) { ifp->if_capabilities |= IFCAP_TSO6; ifp->if_hwassist |= CSUM_IP6_TSO; } /* Enable all available capabilities by default. */ ifp->if_capenable = ifp->if_capabilities; if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; } ether_ifattach(ifp, eaddr); if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { if_printf(ifp, "TSO segcnt %u segsz %u\n", ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } /* Inform the upper layer about the long frame support. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* * Kick off link status check. 
*/ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; hn_link_status_update(sc); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) hn_synth_detach(sc); netvsc_detach(dev); return (error); } static int netvsc_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct ifnet *ifp = sc->hn_ifp; if (device_is_attached(dev)) { HN_LOCK(sc); if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc); /* * NOTE: * hn_stop() only suspends data, so managment * stuffs have to be suspended manually here. */ hn_suspend_mgmt(sc); hn_synth_detach(sc); } HN_UNLOCK(sc); ether_ifdetach(ifp); } ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); if (sc->hn_tx_taskq != hn_tx_taskq) taskqueue_free(sc->hn_tx_taskq); taskqueue_free(sc->hn_mgmt_taskq0); if (sc->hn_xact != NULL) vmbus_xact_ctx_destroy(sc->hn_xact); if_free(ifp); HN_LOCK_DESTROY(sc); return (0); } /* * Standard shutdown entry point */ static int netvsc_shutdown(device_t dev) { return (0); } static void hn_link_status(struct hn_softc *sc) { uint32_t link_status; int error; error = hn_rndis_get_linkstatus(sc, &link_status); if (error) { /* XXX what to do? */ return; } if (link_status == NDIS_MEDIA_STATE_CONNECTED) sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; else sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? LINK_STATE_UP : LINK_STATE_DOWN); } static void hn_link_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) return; hn_link_status(sc); } static void hn_netchg_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Prevent any link status checks from running. */ sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; /* * Fake up a [link down --> link up] state change; 5 seconds * delay is used, which closely simulates miibus reaction * upon link down event. */ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 5 * hz); } static void hn_netchg_status_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Re-allow link status checks. 
*/ sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; hn_link_status(sc); } -void +static void hn_link_status_update(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); } -void +static void hn_network_change(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); } static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); if (m_new == NULL) return ENOBUFS; else *m_head = m = m_new; txr->hn_tx_collapsed++; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } return error; } static __inline int hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("chim txd uses dmamap")); hn_chim_free(txr->hn_sc, txd->chim_index); txd->chim_index = HN_NVS_CHIM_IDX_INVALID; } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; } txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); KASSERT(txr->hn_txdesc_avail >= 0 && txr->hn_txdesc_avail < txr->hn_txdesc_cnt, ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail++; SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); mtx_unlock_spin(&txr->hn_txlist_spin); #else atomic_add_int(&txr->hn_txdesc_avail, 1); buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif return 1; } static __inline struct hn_txdesc * hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { KASSERT(txr->hn_txdesc_avail > 0, ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif KASSERT(txd->m == NULL && txd->refs == 0 && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && (txd->flags & HN_TXD_FLAG_ONLIST) && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { bool pending = false; #ifndef HN_USE_TXDESC_BUFRING 
mtx_lock_spin(&txr->hn_txlist_spin); if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) pending = true; mtx_unlock_spin(&txr->hn_txlist_spin); #else if (!buf_ring_full(txr->hn_txdesc_br)) pending = true; #endif return (pending); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void -hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc, +hn_tx_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; txr = txd->txr; KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } -void +static void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) tcp_lro_flush_all(&rxr->hn_lro); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } static __inline uint32_t hn_rndis_pktmsg_offset(uint32_t ofs) { KASSERT(ofs >= sizeof(struct rndis_packet_msg), ("invalid RNDIS packet msg offset %u", ofs)); return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); } +static __inline void * +hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, + size_t pi_dlen, uint32_t pi_type) +{ + const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); + struct rndis_pktinfo *pi; + + KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, + ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); + + /* + * Per-packet-info does not move; it only grows. + * + * NOTE: + * rm_pktinfooffset in this phase counts from the beginning + * of rndis_packet_msg. + */ + KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, + ("%u pktinfo overflows RNDIS packet msg", pi_type)); + pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + + pkt->rm_pktinfolen); + pkt->rm_pktinfolen += pi_size; + + pi->rm_size = pi_size; + pi->rm_type = pi_type; + pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; + + /* Data immediately follow per-packet-info. */ + pkt->rm_dataoffset += pi_size; + + /* Update RNDIS packet msg length */ + pkt->rm_len += pi_size; + + return (pi->rm_data); +} + /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. */ static int hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; struct rndis_packet_msg *pkt; uint32_t *pi_data; int pktlen; /* * extension points to the area reserved for the * rndis_filter_packet, which is placed just after * the netvsc_packet (and rppi struct, if present; * length is updated later). */ pkt = txd->rndis_pkt; pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len; pkt->rm_dataoffset = sizeof(*pkt); pkt->rm_datalen = m_head->m_pkthdr.len; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* * Set the hash value for this packet, so that the host could * dispatch the TX done event for this packet back to this TX * ring's channel. 
*/ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); *pi_data = txr->hn_tx_idx; } if (m_head->m_flags & M_VLANTAG) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); *pi_data = NDIS_VLAN_INFO_MAKE( EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { #if defined(INET6) || defined(INET) struct ether_vlan_header *eh; int ether_len; /* * XXX need m_pullup and use mtodo */ eh = mtod(m_head, struct ether_vlan_header*); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ether_len = ETHER_HDR_LEN; pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip = (struct ip *)(m_head->m_data + ether_len); unsigned long iph_len = ip->ip_hl << 2; struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iph_len); ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6 = (struct ip6_hdr *) (m_head->m_data + ether_len); struct tcphdr *th = (struct tcphdr *)(ip6 + 1); ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, m_head->m_pkthdr.tso_segsz); } #endif #endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); if (m_head->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP)) { *pi_data = NDIS_TXCSUM_INFO_IPV6; } else { *pi_data = NDIS_TXCSUM_INFO_IPV4; if (m_head->m_pkthdr.csum_flags & CSUM_IP) *pi_data |= NDIS_TXCSUM_INFO_IPCS; } if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) *pi_data |= NDIS_TXCSUM_INFO_TCPCS; else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) *pi_data |= NDIS_TXCSUM_INFO_UDPCS; } pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset); pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Chimney send, if the packet could fit into one chimney buffer. */ if (pkt->rm_len < txr->hn_chim_size) { txr->hn_tx_chimney_tried++; txd->chim_index = hn_chim_alloc(txr->hn_sc); if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { uint8_t *dest = txr->hn_sc->hn_chim + (txd->chim_index * txr->hn_sc->hn_chim_szmax); memcpy(dest, pkt, pktlen); dest += pktlen; m_copydata(m_head, 0, m_head->m_pkthdr.len, dest); txd->chim_size = pkt->rm_len; txr->hn_gpa_cnt = 0; txr->hn_tx_chimney++; txr->hn_sendpkt = hn_sendpkt_rndis_chim; goto done; } } error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (error) { int freed; /* * This mbuf is not linked w/ the txd yet, so free it now. 
*/ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; /* +1 RNDIS packet message */ txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; txr->hn_gpa[0].gpa_len = pktlen; /* * Fill the page buffers with mbuf info after the page * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; txr->hn_sendpkt = hn_sendpkt_rndis_sglist; done: txd->m = m_head; /* Set the completion routine */ - hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd); + hn_nvs_sendctx_init(&txd->send_ctx, hn_tx_done, txd); return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0; again: /* * Make sure that txd is not freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); error = txr->hn_sendpkt(txr, txd); if (!error) { ETHER_BPF_MTAP(ifp, txd->m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (!hn_use_if_start) { if_inc_counter(ifp, IFCOUNTER_OBYTES, txd->m->m_pkthdr.len); if (txd->m->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } txr->hn_pkts++; } hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). */ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. */ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } return error; } /* * Start a transmit of one or more packets */ static int hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); if (__predict_false(txr->hn_suspended)) return 0; if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return 0; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { struct hn_txdesc *txd; struct mbuf *m_head; int error; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. 
*/ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); return 1; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } error = hn_encap(txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ continue; } error = hn_send_pkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } return 0; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { #if __FreeBSD_version >= 1100095 if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } #endif return tcp_lro_rx(lc, m, 0); } #endif -/* - * Called when we receive a data packet from the "wire" on the - * specified device - * - * Note: This is no longer used as a callback - */ -int +static int hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, - const struct hn_recvinfo *info) + const struct hn_rxinfo *info) { struct ifnet *ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1; int hash_type; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return (0); /* * Bail out if packet contains more data than configured MTU. */ if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) { return (0); } else if (dlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), data, dlen); m_new->m_pkthdr.len = m_new->m_len = dlen; rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. 
*/ size = MCLBYTES; if (dlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } hv_m_append(m_new, dlen, data); } m_new->m_pkthdr.rcvif = ifp; if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { /* IP csum offload */ if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } if ((info->csum_info & (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); if (m_new->m_len < hoff) goto skip; eh = mtod(m_new, struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) goto skip; evl = mtod(m_new, struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } if (etype == ETHERTYPE_IP) { int pr; pr = hn_check_iplen(m_new, hoff); if (pr == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; } else if (pr == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } } else if (pr != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } skip: if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( NDIS_VLAN_INFO_ID(info->vlan_info), NDIS_VLAN_INFO_PRI(info->vlan_info), NDIS_VLAN_INFO_CFI(info->vlan_info)); m_new->m_flags |= M_VLANTAG; } if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = info->hash_value; hash_type = M_HASHTYPE_OPAQUE_HASH; if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); /* * Note: Moved RX completion back to hv_nv_on_receive() so all * messages (not just data messages) will trigger a response. 
*/ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); rxr->hn_pkts++; if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } /* We're not holding the lock here, so don't release it */ (*ifp->if_input)(ifp, m_new); return (0); } static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct hn_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int mask, error = 0; switch (cmd) { case SIOCSIFMTU: - if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) { + if (ifr->ifr_mtu > HN_MTU_MAX) { error = EINVAL; break; } HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if ((sc->hn_caps & HN_CAP_MTU) == 0) { /* Can't change MTU */ HN_UNLOCK(sc); error = EOPNOTSUPP; break; } if (ifp->if_mtu == ifr->ifr_mtu) { HN_UNLOCK(sc); break; } /* * Suspend this interface before the synthetic parts * are ripped. */ hn_suspend(sc); /* * Detach the synthetics parts, i.e. NVS and RNDIS. */ hn_synth_detach(sc); /* * Reattach the synthetic parts, i.e. NVS and RNDIS, * with the new MTU setting. */ error = hn_synth_attach(sc, ifr->ifr_mtu); if (error) { HN_UNLOCK(sc); break; } /* * Commit the requested MTU, after the synthetic parts * have been successfully attached. */ ifp->if_mtu = ifr->ifr_mtu; /* * Make sure that various parameters based on MTU are * still valid, after the MTU change. */ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) hn_set_chim_size(sc, sc->hn_chim_szmax); hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); #endif /* * All done! Resume the interface now. */ hn_resume(sc); HN_UNLOCK(sc); break; case SIOCSIFFLAGS: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_set_rxfilter(sc); else hn_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc); } sc->hn_if_flags = ifp->if_flags; HN_UNLOCK(sc); break; case SIOCSIFCAP: HN_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); } /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #endif if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; else ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; else ifp->if_hwassist &= ~CSUM_IP6_TSO; } HN_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: #ifdef notyet /* * XXX * Multicast uses mutex, while RNDIS RX filter setting * sleeps. 
We workaround this by always enabling * ALLMULTI. ALLMULTI would actually always be on, even * if we supported the SIOCADDMULTI/SIOCDELMULTI, since * we don't support multicast address list configuration * for this driver. */ HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_set_rxfilter(sc); HN_UNLOCK(sc); #endif break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void hn_stop(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Clear RUNNING bit _before_ hn_suspend_data() */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); hn_suspend_data(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; } /* * FreeBSD transmit entry point */ static void hn_start(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } static void hn_start_txeof(struct hn_tx_ring *txr) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the OACTIVE earlier, with the hope, that * others could catch up. The task will clear the * flag again with the hn_tx_lock to avoid possible * races. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_init_locked(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; /* Configure RX filter */ hn_set_rxfilter(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* Clear TX 'suspended' bit. */ hn_tx_resume(sc, sc->hn_tx_ring_inuse); /* Everything is ready; unleash! 
*/ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); } static void hn_init(void *xsc) { struct hn_softc *sc = xsc; HN_LOCK(sc); hn_init_locked(sc); HN_UNLOCK(sc); } #ifdef LATER /* * */ static void hn_watchdog(struct ifnet *ifp) { if_printf(ifp, "watchdog timeout -- resetting\n"); hn_init(ifp->if_softc); /* XXX */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } #endif #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) { HN_UNLOCK(sc); return EINVAL; } hn_set_lro_lenlim(sc, lenlim); HN_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. */ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. */ --ackcnt; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; } #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } HN_UNLOCK(sc); return 0; } static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chim_size, error; chim_size = sc->hn_tx_ring[0].hn_chim_size; error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; HN_LOCK(sc); hn_set_chim_size(sc, chim_size); HN_UNLOCK(sc); return 0; } #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((int *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((int *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. 
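	 *
	 * These per-ring stat handlers receive __offsetof(struct
	 * hn_rx_ring, <field>) in arg2, so one handler can serve every
	 * counter.  A minimal userland sketch of the same idiom (names
	 * below are illustrative only, not part of the driver):
	 *
	 *	#include <stddef.h>	// offsetof
	 *	#include <stdint.h>
	 *
	 *	struct ring { uint64_t pkts, drops; };
	 *
	 *	static uint64_t
	 *	sum_stat(const struct ring *r, int cnt, size_t ofs)
	 *	{
	 *		uint64_t total = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < cnt; ++i)
	 *			total += *(const uint64_t *)
	 *			    ((const uint8_t *)&r[i] + ofs);
	 *		return (total);
	 *	}
	 *
	 *	// e.g. sum_stat(rings, nrings, offsetof(struct ring, drops))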
*/ for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } HN_UNLOCK(sc); return 0; } static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char caps_str[128]; uint32_t caps; HN_LOCK(sc); caps = sc->hn_caps; HN_UNLOCK(sc); - snprintf(caps_str, sizeof(caps_str), "%b", caps, - "\020" - "\001VLAN" - "\002MTU" - "\003IPCS" - "\004TCP4CS" - "\005TCP6CS" - "\006UDP4CS" - "\007UDP6CS" - "\010TSO4" - "\011TSO6" - "\012HASHVAL"); + snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); } static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char assist_str[128]; uint32_t hwassist; HN_LOCK(sc); hwassist = sc->hn_ifp->if_hwassist; HN_UNLOCK(sc); snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); } static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char filter_str[128]; uint32_t filter; HN_LOCK(sc); filter = sc->hn_rx_filter; HN_UNLOCK(sc); snprintf(filter_str, sizeof(filter_str), "%b", filter, NDIS_PACKET_TYPES); return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); } static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error || req->newptr == NULL) goto back; error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (sc->hn_rx_ring_inuse > 1) { error = hn_rss_reconfig(sc); } else { /* Not RSS capable, at least for now; just 
save the RSS key. */ error = 0; } back: HN_UNLOCK(sc); return (error); } static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error || req->newptr == NULL) goto back; /* * Don't allow RSS indirect table change, if this interface is not * RSS capable currently. */ if (sc->hn_rx_ring_inuse == 1) { error = EOPNOTSUPP; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); return (error); } static int +hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char hash_str[128]; + uint32_t hash; + + HN_LOCK(sc); + hash = sc->hn_rss_hash; + HN_UNLOCK(sc); + snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); + return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); +} + +static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 int lroent_cnt; #endif #endif int i; /* * Create RXBUF for reception. * * NOTE: * - It is shared by all channels. * - A large enough buffer is allocated, certain version of NVSes * may further limit the usable space. 
*/ sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), - PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma, + PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_rxbuf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return (ENOMEM); } sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, - M_NETVSC, M_WAITOK | M_ZERO); + M_DEVBUF, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; if (bootverbose) device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), - PAGE_SIZE, 0, - NETVSC_DEVICE_RING_BUFFER_SIZE + - NETVSC_DEVICE_RING_BUFFER_SIZE, + PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, &rxr->hn_br_dma, BUS_DMA_WAITOK); if (rxr->hn_br == NULL) { device_printf(dev, "allocate bufring failed\n"); return (ENOMEM); } if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; - rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK); + rxr->hn_pktbuf = malloc(HN_PKTBUF_LEN, M_DEVBUF, M_WAITOK); rxr->hn_rx_idx = i; rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. 
*/ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); #else tcp_lro_init(&rxr->hn_lro); rxr->hn_lro.ifp = sc->hn_ifp; #endif #if __FreeBSD_version >= 1100099 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); #if __FreeBSD_version >= 1100099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); #endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, 
"LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); return (0); } static void hn_destroy_rx_data(struct hn_softc *sc) { int i; if (sc->hn_rxbuf != NULL) { hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); sc->hn_rxbuf = NULL; } if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_br == NULL) continue; hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); rxr->hn_br = NULL; #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif - free(rxr->hn_rdbuf, M_NETVSC); + free(rxr->hn_pktbuf, M_DEVBUF); } - free(sc->hn_rx_ring, M_NETVSC); + free(sc->hn_rx_ring, M_DEVBUF); sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; sc->hn_rx_ring_inuse = 0; } static int hn_create_tx_ring(struct hn_softc *sc, int id) { struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; txr->hn_sc = sc; txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); #endif mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); txr->hn_txdesc_cnt = HN_TX_DESC_CNT; txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, - M_NETVSC, M_WAITOK | M_ZERO); + M_DEVBUF, M_WAITOK | M_ZERO); #ifndef HN_USE_TXDESC_BUFRING SLIST_INIT(&txr->hn_txlist); #else - txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC, + txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); #endif txr->hn_tx_taskq = sc->hn_tx_taskq; if (hn_use_if_start) { txr->hn_txeof = hn_start_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); } else { int br_depth; txr->hn_txeof = hn_xmit_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); br_depth = hn_get_txswq_depth(txr); - txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC, + txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); } txr->hn_direct_tx_size = hn_direct_tx_size; /* * Always schedule transmission instead of trying to do direct * transmission. This one gives the best performance so far. */ txr->hn_sched_tx = 1; parent_dtag = bus_get_dma_tag(dev); /* DMA tag for RNDIS packet messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ HN_RNDIS_PKT_ALIGN, /* alignment */ HN_RNDIS_PKT_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_RNDIS_PKT_LEN, /* maxsize */ 1, /* nsegments */ HN_RNDIS_PKT_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_rndis_dtag); if (error) { device_printf(dev, "failed to create rndis dmatag\n"); return error; } /* DMA tag for data. 
*/ error = bus_dma_tag_create(parent_dtag, /* parent */ 1, /* alignment */ HN_TX_DATA_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_TX_DATA_MAXSIZE, /* maxsize */ HN_TX_DATA_SEGCNT_MAX, /* nsegments */ HN_TX_DATA_SEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_data_dtag); if (error) { device_printf(dev, "failed to create data dmatag\n"); return error; } for (i = 0; i < txr->hn_txdesc_cnt; ++i) { struct hn_txdesc *txd = &txr->hn_txdesc[i]; txd->txr = txr; txd->chim_index = HN_NVS_CHIM_IDX_INVALID; /* * Allocate and load RNDIS packet message. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_pkt, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &txd->rndis_pkt_dmap); if (error) { device_printf(dev, "failed to allocate rndis_packet_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap, txd->rndis_pkt, HN_RNDIS_PKT_LEN, hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "failed to load rndis_packet_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* DMA map for TX data. */ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); #else buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif } txr->hn_txdesc_avail = txr->hn_txdesc_cnt; if (sc->hn_tx_sysctl_tree != NULL) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; char name[16]; /* * Create per TX ring sysctl tree: * dev.hn.UNIT.tx.RINGID */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); if (!hn_use_if_start) { SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW, &txr->hn_pkts, "# of packets transmitted"); } } return 0; } static void hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) { struct hn_tx_ring *txr = txd->txr; KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } static void hn_destroy_tx_ring(struct hn_tx_ring *txr) { struct hn_txdesc *txd; if (txr->hn_txdesc == NULL) return; #ifndef HN_USE_TXDESC_BUFRING while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) { SLIST_REMOVE_HEAD(&txr->hn_txlist, link); hn_txdesc_dmamap_destroy(txd); } #else mtx_lock(&txr->hn_tx_lock); while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL) hn_txdesc_dmamap_destroy(txd); mtx_unlock(&txr->hn_tx_lock); #endif if 
(txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); if (txr->hn_tx_rndis_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); #ifdef HN_USE_TXDESC_BUFRING - buf_ring_free(txr->hn_txdesc_br, M_NETVSC); + buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); #endif - free(txr->hn_txdesc, M_NETVSC); + free(txr->hn_txdesc, M_DEVBUF); txr->hn_txdesc = NULL; if (txr->hn_mbuf_br != NULL) - buf_ring_free(txr->hn_mbuf_br, M_NETVSC); + buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif mtx_destroy(&txr->hn_tx_lock); } static int hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; /* * Create TXBUF for chimney sending. * * NOTE: It is shared by all channels. */ sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), - PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma, + PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_chim == NULL) { device_printf(sc->hn_dev, "allocate txbuf failed\n"); return (ENOMEM); } sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, - M_NETVSC, M_WAITOK | M_ZERO); + M_DEVBUF, M_WAITOK | M_ZERO); ctx = device_get_sysctl_ctx(sc->hn_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; error = hn_create_tx_ring(sc, i); if (error) return error; } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", CTLFLAG_RD, &sc->hn_chim_szmax, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_chim_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size 
of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); return 0; } static void hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_chim_size = chim_size; } static void hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) { struct ifnet *ifp = sc->hn_ifp; int tso_minlen; if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) return; KASSERT(sc->hn_ndis_tso_sgmin >= 2, ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); tso_minlen = sc->hn_ndis_tso_sgmin * mtu; KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && sc->hn_ndis_tso_szmax <= IP_MAXPACKET, ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); if (tso_maxlen < tso_minlen) tso_maxlen = tso_minlen; else if (tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; if (tso_maxlen > sc->hn_ndis_tso_szmax) tso_maxlen = sc->hn_ndis_tso_szmax; ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); if (bootverbose) if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); } static void hn_fixup_tx_data(struct hn_softc *sc) { uint64_t csum_assist; int i; hn_set_chim_size(sc, sc->hn_chim_szmax); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_chim_szmax) hn_set_chim_size(sc, hn_tx_chimney_size); csum_assist = 0; if (sc->hn_caps & HN_CAP_IPCS) csum_assist |= CSUM_IP; if (sc->hn_caps & HN_CAP_TCP4CS) csum_assist |= CSUM_IP_TCP; if (sc->hn_caps & HN_CAP_UDP4CS) csum_assist |= CSUM_IP_UDP; #ifdef notyet if (sc->hn_caps & HN_CAP_TCP6CS) csum_assist |= CSUM_IP6_TCP; if (sc->hn_caps & HN_CAP_UDP6CS) csum_assist |= CSUM_IP6_UDP; #endif for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_csum_assist = csum_assist; if (sc->hn_caps & HN_CAP_HASHVAL) { /* * Support HASHVAL pktinfo on TX path. 
*/ if (bootverbose) if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; } } static void hn_destroy_tx_data(struct hn_softc *sc) { int i; if (sc->hn_chim != NULL) { hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); sc->hn_chim = NULL; } if (sc->hn_tx_ring_cnt == 0) return; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) hn_destroy_tx_ring(&sc->hn_tx_ring[i]); - free(sc->hn_tx_ring, M_NETVSC); + free(sc->hn_tx_ring, M_DEVBUF); sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; sc->hn_tx_ring_inuse = 0; } static void hn_start_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_start_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_xmit(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; struct mbuf *m_head; mtx_assert(&txr->hn_tx_lock, MA_OWNED); KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); if (__predict_false(txr->hn_suspended)) return 0; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return 0; while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; int error; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); return 1; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } error = hn_encap(txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ drbr_advance(ifp, txr->hn_mbuf_br); continue; } error = hn_send_pkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } return 0; } static int hn_transmit(struct ifnet *ifp, struct mbuf *m) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr; int error, idx = 0; /* * Select the TX ring based on flowid */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; txr = &sc->hn_tx_ring[idx]; error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); if (error) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return error; } if (txr->hn_oactive) return 0; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return 0; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); return 0; } static void hn_tx_ring_qflush(struct hn_tx_ring *txr) { struct mbuf *m; mtx_lock(&txr->hn_tx_lock); while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) m_freem(m); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_qflush(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); if_qflush(ifp); } static void hn_xmit_txeof(struct hn_tx_ring *txr) { if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; txr->hn_oactive = 
0; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the oactive earlier, with the hope, that * others could catch up. The task will clear the * oactive again with the hn_tx_lock to avoid possible * races. */ txr->hn_oactive = 0; taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_xmit_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); txr->hn_oactive = 0; hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) { struct vmbus_chan_br cbr; struct hn_rx_ring *rxr; struct hn_tx_ring *txr = NULL; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("RX ring %d already attached", idx)); rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; if (bootverbose) { if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } } /* Bind this channel to a proper CPU. */ vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus); /* * Open this channel */ cbr.cbr = rxr->hn_br; cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; - cbr.cbr_txsz = NETVSC_DEVICE_RING_BUFFER_SIZE; - cbr.cbr_rxsz = NETVSC_DEVICE_RING_BUFFER_SIZE; + cbr.cbr_txsz = HN_TXBR_SIZE; + cbr.cbr_rxsz = HN_RXBR_SIZE; error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); if (error) { if_printf(sc->hn_ifp, "open chan%u failed: %d\n", vmbus_chan_id(chan), error); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (txr != NULL) txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } return (error); } static void hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { struct hn_rx_ring *rxr; int idx; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), ("RX ring %d is not attached", idx)); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (idx < sc->hn_tx_ring_inuse) { struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), ("TX ring %d is not attached attached", idx)); txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } /* * Close this channel. * * NOTE: * Channel closing does _not_ destroy the target channel. */ vmbus_chan_close(chan); } static int hn_attach_subchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i, error = 0; if (subchan_cnt == 0) return (0); /* Attach the sub-channels. 
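	 *
	 * Each hn_chan_attach() call hands the per-ring DMA block that
	 * hn_create_rx_data() allocated (HN_TXBR_SIZE + HN_RXBR_SIZE) to
	 * vmbus_chan_open_br(); cbr_txsz/cbr_rxsz describe how that one
	 * block is split (illustrative layout only):
	 *
	 *	rxr->hn_br: [ TX bufring, HN_TXBR_SIZE | RX bufring, HN_RXBR_SIZE ]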
*/ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { error = hn_chan_attach(sc, subchans[i]); if (error) break; } vmbus_subchan_rel(subchans, subchan_cnt); if (error) { if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "%d sub-channels attached\n", subchan_cnt); } } return (error); } static void hn_detach_allchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i; if (subchan_cnt == 0) goto back; /* Detach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) hn_chan_detach(sc, subchans[i]); vmbus_subchan_rel(subchans, subchan_cnt); back: /* * Detach the primary channel, _after_ all sub-channels * are detached. */ hn_chan_detach(sc, sc->hn_prichan); /* Wait for sub-channels to be destroyed, if any. */ vmbus_subchan_drain(sc->hn_prichan); #ifdef INVARIANTS for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { KASSERT((sc->hn_rx_ring[i].hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("%dth RX ring is still attached", i)); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { KASSERT((sc->hn_tx_ring[i].hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("%dth TX ring is still attached", i)); } #endif } static int hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) { struct vmbus_channel **subchans; int nchan, rxr_cnt, error; nchan = *nsubch + 1; if (nchan == 1) { /* * Multiple RX/TX rings are not requested. */ *nsubch = 0; return (0); } /* * Query RSS capabilities, e.g. # of RX rings, and # of indirect * table entries. */ error = hn_rndis_query_rsscaps(sc, &rxr_cnt); if (error) { /* No RSS; this is benign. */ *nsubch = 0; return (0); } if (bootverbose) { if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", rxr_cnt, nchan); } if (nchan > rxr_cnt) nchan = rxr_cnt; if (nchan == 1) { if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); *nsubch = 0; return (0); } /* * Allocate sub-channels from NVS. */ *nsubch = nchan - 1; error = hn_nvs_alloc_subchans(sc, nsubch); if (error || *nsubch == 0) { /* Failed to allocate sub-channels. */ *nsubch = 0; return (0); } /* * Wait for all sub-channels to become ready before moving on. */ subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); vmbus_subchan_rel(subchans, *nsubch); return (0); } static int hn_synth_attach(struct hn_softc *sc, int mtu) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int error, nsubch, nchan, i; uint32_t old_caps; KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, ("synthetic parts were attached")); /* Save capabilities for later verification. */ old_caps = sc->hn_caps; sc->hn_caps = 0; + /* Clear RSS stuffs. */ + sc->hn_rss_ind_size = 0; + sc->hn_rss_hash = 0; + /* * Attach the primary channel _before_ attaching NVS and RNDIS. */ error = hn_chan_attach(sc, sc->hn_prichan); if (error) return (error); /* * Attach NVS. */ error = hn_nvs_attach(sc, mtu); if (error) return (error); /* * Attach RNDIS _after_ NVS is attached. */ error = hn_rndis_attach(sc, mtu); if (error) return (error); /* * Make sure capabilities are not changed. */ if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", old_caps, sc->hn_caps); /* Restore old capabilities and abort. */ sc->hn_caps = old_caps; return ENXIO; } /* * Allocate sub-channels for multi-TX/RX rings. * * NOTE: * The # of RX rings that can be used is equivalent to the # of * channels to be requested. 
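	 *
	 * hn_synth_alloc_subchans() negotiates that count roughly as
	 * follows; a minimal sketch with illustrative names (not driver
	 * code):
	 *
	 *	static int
	 *	usable_channels(int rx_ring_cnt, int rss_rx_rings, int granted)
	 *	{
	 *		// what we created, capped by what the RSS caps offer
	 *		int nchan = rx_ring_cnt < rss_rx_rings ?
	 *		    rx_ring_cnt : rss_rx_rings;
	 *		int nsubch = nchan - 1;	// primary channel not counted
	 *
	 *		if (granted < nsubch)	// NVS may grant fewer
	 *			nsubch = granted;
	 *		return (nsubch + 1);	// channels actually usable
	 *	}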
*/ nsubch = sc->hn_rx_ring_cnt - 1; error = hn_synth_alloc_subchans(sc, &nsubch); if (error) return (error); nchan = nsubch + 1; if (nchan == 1) { /* Only the primary channel can be used; done */ goto back; } /* * Configure RSS key and indirect table _after_ all sub-channels * are allocated. */ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { /* * RSS key is not set yet; set it to the default RSS key. */ if (bootverbose) if_printf(sc->hn_ifp, "setup default RSS key\n"); memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); sc->hn_flags |= HN_FLAG_HAS_RSSKEY; } if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { /* * RSS indirect table is not set yet; set it up in round- * robin fashion. */ if (bootverbose) { if_printf(sc->hn_ifp, "setup default RSS indirect " "table\n"); } - /* TODO: Take ndis_rss_caps.ndis_nind into account. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) rss->rss_ind[i] = i % nchan; sc->hn_flags |= HN_FLAG_HAS_RSSIND; } else { /* * # of usable channels may be changed, so we have to * make sure that all entries in RSS indirect table * are valid. */ hn_rss_ind_fixup(sc, nchan); } error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { /* * Failed to configure RSS key or indirect table; only * the primary channel can be used. */ nchan = 1; } back: /* * Set the # of TX/RX rings that could be used according to * the # of channels that NVS offered. */ hn_set_ring_inuse(sc, nchan); /* * Attach the sub-channels, if any. */ error = hn_attach_subchans(sc); if (error) return (error); sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; return (0); } /* * NOTE: * The interface must have been suspended through hn_suspend(), before * this function gets called. */ static void hn_synth_detach(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Detach the RNDIS first. */ hn_rndis_detach(sc); /* Detach NVS. */ hn_nvs_detach(sc); /* Detach all of the channels. */ hn_detach_allchans(sc); sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; } static void hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) { KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, ("invalid ring count %d", ring_cnt)); if (sc->hn_tx_ring_cnt > ring_cnt) sc->hn_tx_ring_inuse = ring_cnt; else sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_rx_ring_inuse = ring_cnt; if (bootverbose) { if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); } } static void hn_rx_drain(struct vmbus_channel *chan) { while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan)) pause("waitch", 1); vmbus_chan_intr_drain(chan); } static void hn_suspend_data(struct hn_softc *sc) { struct vmbus_channel **subch = NULL; int i, nsubch; HN_LOCK_ASSERT(sc); /* * Suspend TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 1; mtx_unlock(&txr->hn_tx_lock); /* No one is able to send more packets now. */ /* Wait for all pending sends to finish. */ while (hn_tx_ring_pending(txr)) pause("hnwtx", 1 /* 1 tick */); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } /* * Disable RX by clearing RX filter. */ - hn_rndis_set_rxfilter(sc, 0); - sc->hn_rx_filter = 0; + sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; + hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* * Give RNDIS enough time to flush all pending data packets. */ pause("waitrx", (200 * hz) / 1000); /* * Drain RX/TX bufrings and interrupts. 
*/ nsubch = sc->hn_rx_ring_inuse - 1; if (nsubch > 0) subch = vmbus_subchan_get(sc->hn_prichan, nsubch); if (subch != NULL) { for (i = 0; i < nsubch; ++i) hn_rx_drain(subch[i]); } hn_rx_drain(sc->hn_prichan); if (subch != NULL) vmbus_subchan_rel(subch, nsubch); } static void hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) { ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; } static void hn_suspend_mgmt(struct hn_softc *sc) { struct task task; HN_LOCK_ASSERT(sc); /* * Make sure that hn_mgmt_taskq0 can no longer be accessed * through hn_mgmt_taskq. */ TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); vmbus_chan_run_task(sc->hn_prichan, &task); /* * Make sure that all pending management tasks are completed. */ taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); taskqueue_drain_all(sc->hn_mgmt_taskq0); } static void hn_suspend(struct hn_softc *sc) { if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_suspend_data(sc); hn_suspend_mgmt(sc); } static void hn_tx_resume(struct hn_softc *sc, int tx_ring_cnt) { int i; KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, ("invalid TX ring count %d", tx_ring_cnt)); for (i = 0; i < tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 0; mtx_unlock(&txr->hn_tx_lock); } } static void hn_resume_data(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* * Re-enable RX. */ hn_set_rxfilter(sc); /* * Make sure to clear suspend status on "all" TX rings, * since hn_tx_ring_inuse can be changed after * hn_suspend_data(). */ hn_tx_resume(sc, sc->hn_tx_ring_cnt); if (!hn_use_if_start) { /* * Flush unused drbrs, since hn_tx_ring_inuse may be * reduced. */ for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); } /* * Kick start TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; /* * Use txeof task, so that any pending oactive can be * cleared properly. */ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_resume_mgmt(struct hn_softc *sc) { sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; /* * Kick off network change detection, if it was pending. * If no network change was pending, start link status * checks, which are more lightweight than network change * detection. */ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) hn_network_change(sc); else hn_link_status_update(sc); } static void hn_resume(struct hn_softc *sc) { if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_resume_data(sc); hn_resume_mgmt(sc); } +static void +hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) +{ + const struct rndis_status_msg *msg; + int ofs; + + if (dlen < sizeof(*msg)) { + if_printf(sc->hn_ifp, "invalid RNDIS status\n"); + return; + } + msg = data; + + switch (msg->rm_status) { + case RNDIS_STATUS_MEDIA_CONNECT: + case RNDIS_STATUS_MEDIA_DISCONNECT: + hn_link_status_update(sc); + break; + + case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: + /* Not really useful; ignore. 
*/ + break; + + case RNDIS_STATUS_NETWORK_CHANGE: + ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); + if (dlen < ofs + msg->rm_stbuflen || + msg->rm_stbuflen < sizeof(uint32_t)) { + if_printf(sc->hn_ifp, "network changed\n"); + } else { + uint32_t change; + + memcpy(&change, ((const uint8_t *)msg) + ofs, + sizeof(change)); + if_printf(sc->hn_ifp, "network changed, change %u\n", + change); + } + hn_network_change(sc); + break; + + default: + if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", + msg->rm_status); + break; + } +} + +static int +hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) +{ + const struct rndis_pktinfo *pi = info_data; + uint32_t mask = 0; + + while (info_dlen != 0) { + const void *data; + uint32_t dlen; + + if (__predict_false(info_dlen < sizeof(*pi))) + return (EINVAL); + if (__predict_false(info_dlen < pi->rm_size)) + return (EINVAL); + info_dlen -= pi->rm_size; + + if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) + return (EINVAL); + if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) + return (EINVAL); + dlen = pi->rm_size - pi->rm_pktinfooffset; + data = pi->rm_data; + + switch (pi->rm_type) { + case NDIS_PKTINFO_TYPE_VLAN: + if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) + return (EINVAL); + info->vlan_info = *((const uint32_t *)data); + mask |= HN_RXINFO_VLAN; + break; + + case NDIS_PKTINFO_TYPE_CSUM: + if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) + return (EINVAL); + info->csum_info = *((const uint32_t *)data); + mask |= HN_RXINFO_CSUM; + break; + + case HN_NDIS_PKTINFO_TYPE_HASHVAL: + if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) + return (EINVAL); + info->hash_value = *((const uint32_t *)data); + mask |= HN_RXINFO_HASHVAL; + break; + + case HN_NDIS_PKTINFO_TYPE_HASHINF: + if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) + return (EINVAL); + info->hash_info = *((const uint32_t *)data); + mask |= HN_RXINFO_HASHINF; + break; + + default: + goto next; + } + + if (mask == HN_RXINFO_ALL) { + /* All found; done */ + break; + } +next: + pi = (const struct rndis_pktinfo *) + ((const uint8_t *)pi + pi->rm_size); + } + + /* + * Final fixup. + * - If there is no hash value, invalidate the hash info. + */ + if ((mask & HN_RXINFO_HASHVAL) == 0) + info->hash_info = HN_NDIS_HASH_INFO_INVALID; + return (0); +} + +static __inline bool +hn_rndis_check_overlap(int off, int len, int check_off, int check_len) +{ + + if (off < check_off) { + if (__predict_true(off + len <= check_off)) + return (false); + } else if (off > check_off) { + if (__predict_true(check_off + check_len <= off)) + return (false); + } + return (true); +} + static void +hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) +{ + const struct rndis_packet_msg *pkt; + struct hn_rxinfo info; + int data_off, pktinfo_off, data_len, pktinfo_len; + + /* + * Check length. 
+ */ + if (__predict_false(dlen < sizeof(*pkt))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); + return; + } + pkt = data; + + if (__predict_false(dlen < pkt->rm_len)) { + if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " + "dlen %d, msglen %u\n", dlen, pkt->rm_len); + return; + } + if (__predict_false(pkt->rm_len < + pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " + "msglen %u, data %u, oob %u, pktinfo %u\n", + pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, + pkt->rm_pktinfolen); + return; + } + if (__predict_false(pkt->rm_datalen == 0)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); + return; + } + + /* + * Check offests. + */ +#define IS_OFFSET_INVALID(ofs) \ + ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ + ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) + + /* XXX Hyper-V does not meet data offset alignment requirement */ + if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "data offset %u\n", pkt->rm_dataoffset); + return; + } + if (__predict_false(pkt->rm_oobdataoffset > 0 && + IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob offset %u\n", pkt->rm_oobdataoffset); + return; + } + if (__predict_true(pkt->rm_pktinfooffset > 0) && + __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo offset %u\n", pkt->rm_pktinfooffset); + return; + } + +#undef IS_OFFSET_INVALID + + data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); + data_len = pkt->rm_datalen; + pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); + pktinfo_len = pkt->rm_pktinfolen; + + /* + * Check OOB coverage. + */ + if (__predict_false(pkt->rm_oobdatalen != 0)) { + int oob_off, oob_len; + + if_printf(rxr->hn_ifp, "got oobdata\n"); + oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); + oob_len = pkt->rm_oobdatalen; + + if (__predict_false(oob_off + oob_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overflow, msglen %u, oob abs %d len %d\n", + pkt->rm_len, oob_off, oob_len); + return; + } + + /* + * Check against data. + */ + if (hn_rndis_check_overlap(oob_off, oob_len, + data_off, data_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overlaps data, oob abs %d len %d, " + "data abs %d len %d\n", + oob_off, oob_len, data_off, data_len); + return; + } + + /* + * Check against pktinfo. + */ + if (pktinfo_len != 0 && + hn_rndis_check_overlap(oob_off, oob_len, + pktinfo_off, pktinfo_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overlaps pktinfo, oob abs %d len %d, " + "pktinfo abs %d len %d\n", + oob_off, oob_len, pktinfo_off, pktinfo_len); + return; + } + } + + /* + * Check per-packet-info coverage and find useful per-packet-info. + */ + info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; + info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; + info.hash_info = HN_NDIS_HASH_INFO_INVALID; + if (__predict_true(pktinfo_len != 0)) { + bool overlap; + int error; + + if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo overflow, msglen %u, " + "pktinfo abs %d len %d\n", + pkt->rm_len, pktinfo_off, pktinfo_len); + return; + } + + /* + * Check packet info coverage. 
+ */ + overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, + data_off, data_len); + if (__predict_false(overlap)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo overlap data, pktinfo abs %d len %d, " + "data abs %d len %d\n", + pktinfo_off, pktinfo_len, data_off, data_len); + return; + } + + /* + * Find useful per-packet-info. + */ + error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, + pktinfo_len, &info); + if (__predict_false(error)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " + "pktinfo\n"); + return; + } + } + + if (__predict_false(data_off + data_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "data overflow, msglen %u, data abs %d len %d\n", + pkt->rm_len, data_off, data_len); + return; + } + hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); +} + +static __inline void +hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) +{ + const struct rndis_msghdr *hdr; + + if (__predict_false(dlen < sizeof(*hdr))) { + if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); + return; + } + hdr = data; + + if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { + /* Hot data path. */ + hn_rndis_rx_data(rxr, data, dlen); + /* Done! */ + return; + } + + if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) + hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); + else + hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); +} + +static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { const struct hn_nvs_hdr *hdr; if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { if_printf(sc->hn_ifp, "invalid nvs notify\n"); return; } hdr = VMBUS_CHANPKT_CONST_DATA(pkt); if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { /* Useless; ignore */ return; } if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); } static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { - struct hn_send_ctx *sndc; + struct hn_nvs_sendctx *sndc; - sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid; + sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), VMBUS_CHANPKT_DATALEN(pkt)); /* * NOTE: * 'sndc' CAN NOT be accessed anymore, since it can be freed by * its callback. */ } static void -hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr, - struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) +hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, + const struct vmbus_chanpkt_hdr *pkthdr) { const struct vmbus_chanpkt_rxbuf *pkt; const struct hn_nvs_hdr *nvs_hdr; int count, i, hlen; if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); return; } nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure that this is a RNDIS message. 
*/ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", nvs_hdr->nvs_type); return; } hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); if (__predict_false(hlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; if (__predict_false(hlen < __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); return; } /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; ++i) { int ofs, len; ofs = pkt->cp_rxbuf[i].rb_ofs; len = pkt->cp_rxbuf[i].rb_len; - if (__predict_false(ofs + len > NETVSC_RECEIVE_BUFFER_SIZE)) { + if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " "ofs %d, len %d\n", i, ofs, len); continue; } - hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len); + hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); } - + /* * Moved completion call back here so that all received * messages (not just data messages) will trigger a response * message back to the host. */ hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid); } /* * Net VSC on receive completion * * Send a receive completion packet to RNDIS device (ie NetVsp) */ static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid) { struct hn_nvs_rndis_ack ack; int retries = 0; int ret = 0; ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; ack.nvs_status = HN_NVS_STATUS_OK; retry_send_cmplt: /* Send the completion */ ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); if (ret == 0) { /* success */ /* no-op */ } else if (ret == EAGAIN) { /* no more room... 
wait a bit and attempt to retry 3 times */ retries++; if (retries < 4) { DELAY(100); goto retry_send_cmplt; } } } static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = rxr->hn_ifp->if_softc; void *buffer; - int bufferlen = NETVSC_PACKET_SIZE; + int bufferlen = HN_PKTBUF_LEN; - buffer = rxr->hn_rdbuf; + buffer = rxr->hn_pktbuf; do { struct vmbus_chanpkt_hdr *pkt = buffer; uint32_t bytes_rxed; int ret; bytes_rxed = bufferlen; ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); if (ret == 0) { switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hn_nvs_handle_comp(sc, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: - hn_nvs_handle_rxbuf(sc, rxr, chan, pkt); + hn_nvs_handle_rxbuf(rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hn_nvs_handle_notify(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } else if (ret == ENOBUFS) { /* Handle large packet */ - if (bufferlen > NETVSC_PACKET_SIZE) { - free(buffer, M_NETVSC); + if (bufferlen > HN_PKTBUF_LEN) { + free(buffer, M_DEVBUF); buffer = NULL; } /* alloc new buffer */ - buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT); + buffer = malloc(bytes_rxed, M_DEVBUF, M_NOWAIT); if (buffer == NULL) { if_printf(rxr->hn_ifp, "hv_cb malloc buffer failed, len=%u\n", bytes_rxed); bufferlen = 0; break; } bufferlen = bytes_rxed; } else { /* No more packets */ break; } } while (1); - if (bufferlen > NETVSC_PACKET_SIZE) - free(buffer, M_NETVSC); + if (bufferlen > HN_PKTBUF_LEN) + free(buffer, M_DEVBUF); - hv_rf_channel_rollup(rxr, rxr->hn_txr); + hn_chan_rollup(rxr, rxr->hn_txr); } static void hn_tx_taskq_create(void *arg __unused) { if (!hn_share_tx_taskq) return; hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskq); if (hn_bind_tx_taskq >= 0) { int cpu = hn_bind_tx_taskq; cpuset_t cpu_set; if (cpu > mp_ncpus - 1) cpu = mp_ncpus - 1; CPU_SETOF(cpu, &cpu_set); taskqueue_start_threads_cpuset(&hn_tx_taskq, 1, PI_NET, &cpu_set, "hn tx"); } else { taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx"); } } SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST, hn_tx_taskq_create, NULL); static void hn_tx_taskq_destroy(void *arg __unused) { if (hn_tx_taskq != NULL) taskqueue_free(hn_tx_taskq); } SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST, hn_tx_taskq_destroy, NULL); static device_method_t netvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netvsc_probe), DEVMETHOD(device_attach, netvsc_attach), DEVMETHOD(device_detach, netvsc_detach), DEVMETHOD(device_shutdown, netvsc_shutdown), { 0, 0 } }; static driver_t netvsc_driver = { NETVSC_DEVNAME, netvsc_methods, sizeof(struct hn_softc) }; static devclass_t netvsc_devclass; DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_rndis_filter.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_rndis_filter.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_rndis_filter.c (revision 308054) @@ -1,1293 +1,991 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_inet6.h" +#include "opt_inet.h" + #include -#include #include -#include -#include +#include +#include + +#include + +#include #include -#include #include -#include +#include #include + #include #include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include #include -#include -#include -#include + #include +#include +#include +#include +#include -#define HV_RF_RECVINFO_VLAN 0x1 -#define HV_RF_RECVINFO_CSUM 0x2 -#define HV_RF_RECVINFO_HASHINF 0x4 -#define HV_RF_RECVINFO_HASHVAL 0x8 -#define HV_RF_RECVINFO_ALL \ - (HV_RF_RECVINFO_VLAN | \ - HV_RF_RECVINFO_CSUM | \ - HV_RF_RECVINFO_HASHINF | \ - HV_RF_RECVINFO_HASHVAL) - #define HN_RNDIS_RID_COMPAT_MASK 0xffff #define HN_RNDIS_RID_COMPAT_MAX HN_RNDIS_RID_COMPAT_MASK #define HN_RNDIS_XFER_SIZE 2048 #define HN_NDIS_TXCSUM_CAP_IP4 \ (NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT) #define HN_NDIS_TXCSUM_CAP_TCP4 \ (NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT) #define HN_NDIS_TXCSUM_CAP_TCP6 \ (NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \ NDIS_TXCSUM_CAP_IP6EXT) #define HN_NDIS_TXCSUM_CAP_UDP6 \ (NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT) #define HN_NDIS_LSOV2_CAP_IP6 \ (NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT) -/* - * Forward declarations - */ -static void hv_rf_receive_indicate_status(struct hn_softc *sc, - const void *data, int dlen); -static void hv_rf_receive_data(struct hn_rx_ring *rxr, - const void *data, int dlen); +static const void *hn_rndis_xact_exec1(struct hn_softc *, + struct vmbus_xact *, size_t, + struct hn_nvs_sendctx *, size_t *); +static const void *hn_rndis_xact_execute(struct hn_softc *, + struct vmbus_xact *, uint32_t, size_t, size_t *, + uint32_t); +static int hn_rndis_query(struct hn_softc *, uint32_t, + const void *, size_t, void *, size_t *); +static int hn_rndis_query2(struct hn_softc *, uint32_t, + const void *, size_t, void *, size_t *, size_t); +static int hn_rndis_set(struct hn_softc *, uint32_t, + const void *, size_t); +static int hn_rndis_init(struct hn_softc *); +static int hn_rndis_halt(struct hn_softc *); +static int hn_rndis_conf_offload(struct hn_softc *, int); +static int hn_rndis_query_hwcaps(struct hn_softc *, + struct ndis_offload *); -static int hn_rndis_query(struct hn_softc *sc, uint32_t oid, - const void *idata, size_t idlen, void *odata, size_t *odlen0); -static int hn_rndis_query2(struct hn_softc *sc, uint32_t oid, - const void 
*idata, size_t idlen, void *odata, size_t *odlen0, - size_t min_odlen); -static int hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, - size_t dlen); -static int hn_rndis_conf_offload(struct hn_softc *sc, int mtu); -static int hn_rndis_query_hwcaps(struct hn_softc *sc, - struct ndis_offload *caps); - static __inline uint32_t hn_rndis_rid(struct hn_softc *sc) { uint32_t rid; again: rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1); if (rid == 0) goto again; /* Use upper 16 bits for non-compat RNDIS messages. */ return ((rid & 0xffff) << 16); } -void * -hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, - size_t pi_dlen, uint32_t pi_type) -{ - const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); - struct rndis_pktinfo *pi; - - KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, - ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); - - /* - * Per-packet-info does not move; it only grows. - * - * NOTE: - * rm_pktinfooffset in this phase counts from the beginning - * of rndis_packet_msg. - */ - KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, - ("%u pktinfo overflows RNDIS packet msg", pi_type)); - pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + - pkt->rm_pktinfolen); - pkt->rm_pktinfolen += pi_size; - - pi->rm_size = pi_size; - pi->rm_type = pi_type; - pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; - - /* Data immediately follow per-packet-info. */ - pkt->rm_dataoffset += pi_size; - - /* Update RNDIS packet msg length */ - pkt->rm_len += pi_size; - - return (pi->rm_data); -} - -/* - * RNDIS filter receive indicate status - */ -static void -hv_rf_receive_indicate_status(struct hn_softc *sc, const void *data, int dlen) -{ - const struct rndis_status_msg *msg; - int ofs; - - if (dlen < sizeof(*msg)) { - if_printf(sc->hn_ifp, "invalid RNDIS status\n"); - return; - } - msg = data; - - switch (msg->rm_status) { - case RNDIS_STATUS_MEDIA_CONNECT: - case RNDIS_STATUS_MEDIA_DISCONNECT: - hn_link_status_update(sc); - break; - - case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: - /* Not really useful; ignore. 
*/ - break; - - case RNDIS_STATUS_NETWORK_CHANGE: - ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); - if (dlen < ofs + msg->rm_stbuflen || - msg->rm_stbuflen < sizeof(uint32_t)) { - if_printf(sc->hn_ifp, "network changed\n"); - } else { - uint32_t change; - - memcpy(&change, ((const uint8_t *)msg) + ofs, - sizeof(change)); - if_printf(sc->hn_ifp, "network changed, change %u\n", - change); - } - hn_network_change(sc); - break; - - default: - /* TODO: */ - if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", - msg->rm_status); - break; - } -} - -static int -hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_recvinfo *info) -{ - const struct rndis_pktinfo *pi = info_data; - uint32_t mask = 0; - - while (info_dlen != 0) { - const void *data; - uint32_t dlen; - - if (__predict_false(info_dlen < sizeof(*pi))) - return (EINVAL); - if (__predict_false(info_dlen < pi->rm_size)) - return (EINVAL); - info_dlen -= pi->rm_size; - - if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) - return (EINVAL); - if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) - return (EINVAL); - dlen = pi->rm_size - pi->rm_pktinfooffset; - data = pi->rm_data; - - switch (pi->rm_type) { - case NDIS_PKTINFO_TYPE_VLAN: - if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) - return (EINVAL); - info->vlan_info = *((const uint32_t *)data); - mask |= HV_RF_RECVINFO_VLAN; - break; - - case NDIS_PKTINFO_TYPE_CSUM: - if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) - return (EINVAL); - info->csum_info = *((const uint32_t *)data); - mask |= HV_RF_RECVINFO_CSUM; - break; - - case HN_NDIS_PKTINFO_TYPE_HASHVAL: - if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) - return (EINVAL); - info->hash_value = *((const uint32_t *)data); - mask |= HV_RF_RECVINFO_HASHVAL; - break; - - case HN_NDIS_PKTINFO_TYPE_HASHINF: - if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) - return (EINVAL); - info->hash_info = *((const uint32_t *)data); - mask |= HV_RF_RECVINFO_HASHINF; - break; - - default: - goto next; - } - - if (mask == HV_RF_RECVINFO_ALL) { - /* All found; done */ - break; - } -next: - pi = (const struct rndis_pktinfo *) - ((const uint8_t *)pi + pi->rm_size); - } - - /* - * Final fixup. - * - If there is no hash value, invalidate the hash info. - */ - if ((mask & HV_RF_RECVINFO_HASHVAL) == 0) - info->hash_info = HN_NDIS_HASH_INFO_INVALID; - return (0); -} - -static __inline bool -hn_rndis_check_overlap(int off, int len, int check_off, int check_len) -{ - - if (off < check_off) { - if (__predict_true(off + len <= check_off)) - return (false); - } else if (off > check_off) { - if (__predict_true(check_off + check_len <= off)) - return (false); - } - return (true); -} - -/* - * RNDIS filter receive data - */ -static void -hv_rf_receive_data(struct hn_rx_ring *rxr, const void *data, int dlen) -{ - const struct rndis_packet_msg *pkt; - struct hn_recvinfo info; - int data_off, pktinfo_off, data_len, pktinfo_len; - - /* - * Check length. 
- */ - if (__predict_false(dlen < sizeof(*pkt))) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); - return; - } - pkt = data; - - if (__predict_false(dlen < pkt->rm_len)) { - if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " - "dlen %d, msglen %u\n", dlen, pkt->rm_len); - return; - } - if (__predict_false(pkt->rm_len < - pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " - "msglen %u, data %u, oob %u, pktinfo %u\n", - pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, - pkt->rm_pktinfolen); - return; - } - if (__predict_false(pkt->rm_datalen == 0)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); - return; - } - - /* - * Check offests. - */ -#define IS_OFFSET_INVALID(ofs) \ - ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ - ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) - - /* XXX Hyper-V does not meet data offset alignment requirement */ - if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "data offset %u\n", pkt->rm_dataoffset); - return; - } - if (__predict_false(pkt->rm_oobdataoffset > 0 && - IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "oob offset %u\n", pkt->rm_oobdataoffset); - return; - } - if (__predict_true(pkt->rm_pktinfooffset > 0) && - __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "pktinfo offset %u\n", pkt->rm_pktinfooffset); - return; - } - -#undef IS_OFFSET_INVALID - - data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); - data_len = pkt->rm_datalen; - pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); - pktinfo_len = pkt->rm_pktinfolen; - - /* - * Check OOB coverage. - */ - if (__predict_false(pkt->rm_oobdatalen != 0)) { - int oob_off, oob_len; - - if_printf(rxr->hn_ifp, "got oobdata\n"); - oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); - oob_len = pkt->rm_oobdatalen; - - if (__predict_false(oob_off + oob_len > pkt->rm_len)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "oob overflow, msglen %u, oob abs %d len %d\n", - pkt->rm_len, oob_off, oob_len); - return; - } - - /* - * Check against data. - */ - if (hn_rndis_check_overlap(oob_off, oob_len, - data_off, data_len)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "oob overlaps data, oob abs %d len %d, " - "data abs %d len %d\n", - oob_off, oob_len, data_off, data_len); - return; - } - - /* - * Check against pktinfo. - */ - if (pktinfo_len != 0 && - hn_rndis_check_overlap(oob_off, oob_len, - pktinfo_off, pktinfo_len)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "oob overlaps pktinfo, oob abs %d len %d, " - "pktinfo abs %d len %d\n", - oob_off, oob_len, pktinfo_off, pktinfo_len); - return; - } - } - - /* - * Check per-packet-info coverage and find useful per-packet-info. - */ - info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; - info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; - info.hash_info = HN_NDIS_HASH_INFO_INVALID; - if (__predict_true(pktinfo_len != 0)) { - bool overlap; - int error; - - if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "pktinfo overflow, msglen %u, " - "pktinfo abs %d len %d\n", - pkt->rm_len, pktinfo_off, pktinfo_len); - return; - } - - /* - * Check packet info coverage. 
- */ - overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, - data_off, data_len); - if (__predict_false(overlap)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "pktinfo overlap data, pktinfo abs %d len %d, " - "data abs %d len %d\n", - pktinfo_off, pktinfo_len, data_off, data_len); - return; - } - - /* - * Find useful per-packet-info. - */ - error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, - pktinfo_len, &info); - if (__predict_false(error)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " - "pktinfo\n"); - return; - } - } - - if (__predict_false(data_off + data_len > pkt->rm_len)) { - if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " - "data overflow, msglen %u, data abs %d len %d\n", - pkt->rm_len, data_off, data_len); - return; - } - hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); -} - -/* - * RNDIS filter on receive - */ void -hv_rf_on_receive(struct hn_softc *sc, struct hn_rx_ring *rxr, - const void *data, int dlen) +hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_comp_hdr *comp; const struct rndis_msghdr *hdr; - if (__predict_false(dlen < sizeof(*hdr))) { - if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); - return; - } + KASSERT(dlen >= sizeof(*hdr), ("invalid RNDIS msg\n")); hdr = data; switch (hdr->rm_type) { - case REMOTE_NDIS_PACKET_MSG: - hv_rf_receive_data(rxr, data, dlen); - break; - case REMOTE_NDIS_INITIALIZE_CMPLT: case REMOTE_NDIS_QUERY_CMPLT: case REMOTE_NDIS_SET_CMPLT: case REMOTE_NDIS_KEEPALIVE_CMPLT: /* unused */ if (dlen < sizeof(*comp)) { - if_printf(rxr->hn_ifp, "invalid RNDIS cmplt\n"); + if_printf(sc->hn_ifp, "invalid RNDIS cmplt\n"); return; } comp = data; KASSERT(comp->rm_rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid RNDIS rid 0x%08x\n", comp->rm_rid)); vmbus_xact_ctx_wakeup(sc->hn_xact, comp, dlen); break; - case REMOTE_NDIS_INDICATE_STATUS_MSG: - hv_rf_receive_indicate_status(sc, data, dlen); - break; - case REMOTE_NDIS_RESET_CMPLT: /* * Reset completed, no rid. * * NOTE: * RESET is not issued by hn(4), so this message should * _not_ be observed. */ - if_printf(rxr->hn_ifp, "RESET cmplt received\n"); + if_printf(sc->hn_ifp, "RESET cmplt received\n"); break; default: - if_printf(rxr->hn_ifp, "unknown RNDIS msg 0x%x\n", + if_printf(sc->hn_ifp, "unknown RNDIS msg 0x%x\n", hdr->rm_type); break; } } int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr) { size_t eaddr_len; int error; eaddr_len = ETHER_ADDR_LEN; error = hn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, NULL, 0, eaddr, &eaddr_len); if (error) return (error); if (eaddr_len != ETHER_ADDR_LEN) { if_printf(sc->hn_ifp, "invalid eaddr len %zu\n", eaddr_len); return (EINVAL); } return (0); } int hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status) { size_t size; int error; size = sizeof(*link_status); error = hn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, NULL, 0, link_status, &size); if (error) return (error); if (size != sizeof(uint32_t)) { if_printf(sc->hn_ifp, "invalid link status len %zu\n", size); return (EINVAL); } return (0); } static const void * hn_rndis_xact_exec1(struct hn_softc *sc, struct vmbus_xact *xact, size_t reqlen, - struct hn_send_ctx *sndc, size_t *comp_len) + struct hn_nvs_sendctx *sndc, size_t *comp_len) { struct vmbus_gpa gpa[HN_XACT_REQ_PGCNT]; int gpa_cnt, error; bus_addr_t paddr; KASSERT(reqlen <= HN_XACT_REQ_SIZE && reqlen > 0, ("invalid request length %zu", reqlen)); /* * Setup the SG list. 
*/ paddr = vmbus_xact_req_paddr(xact); KASSERT((paddr & PAGE_MASK) == 0, ("vmbus xact request is not page aligned 0x%jx", (uintmax_t)paddr)); for (gpa_cnt = 0; gpa_cnt < HN_XACT_REQ_PGCNT; ++gpa_cnt) { int len = PAGE_SIZE; if (reqlen == 0) break; if (reqlen < len) len = reqlen; gpa[gpa_cnt].gpa_page = atop(paddr) + gpa_cnt; gpa[gpa_cnt].gpa_len = len; gpa[gpa_cnt].gpa_ofs = 0; reqlen -= len; } KASSERT(reqlen == 0, ("still have %zu request data left", reqlen)); /* * Send this RNDIS control message and wait for its completion * message. */ vmbus_xact_activate(xact); error = hn_nvs_send_rndis_ctrl(sc->hn_prichan, sndc, gpa, gpa_cnt); if (error) { vmbus_xact_deactivate(xact); if_printf(sc->hn_ifp, "RNDIS ctrl send failed: %d\n", error); return (NULL); } return (vmbus_xact_wait(xact, comp_len)); } static const void * hn_rndis_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, uint32_t rid, size_t reqlen, size_t *comp_len0, uint32_t comp_type) { const struct rndis_comp_hdr *comp; size_t comp_len, min_complen = *comp_len0; KASSERT(rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid rid %u\n", rid)); KASSERT(min_complen >= sizeof(*comp), ("invalid minimum complete len %zu", min_complen)); /* * Execute the xact setup by the caller. */ - comp = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_send_ctx_none, + comp = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_nvs_sendctx_none, &comp_len); if (comp == NULL) return (NULL); /* * Check this RNDIS complete message. */ if (comp_len < min_complen) { if (comp_len >= sizeof(*comp)) { /* rm_status field is valid */ if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu, " "status 0x%08x\n", comp_len, comp->rm_status); } else { if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu\n", comp_len); } return (NULL); } if (comp->rm_len < min_complen) { if_printf(sc->hn_ifp, "invalid RNDIS comp msglen %u\n", comp->rm_len); return (NULL); } if (comp->rm_type != comp_type) { if_printf(sc->hn_ifp, "unexpected RNDIS comp 0x%08x, " "expect 0x%08x\n", comp->rm_type, comp_type); return (NULL); } if (comp->rm_rid != rid) { if_printf(sc->hn_ifp, "RNDIS comp rid mismatch %u, " "expect %u\n", comp->rm_rid, rid); return (NULL); } /* All pass! */ *comp_len0 = comp_len; return (comp); } static int hn_rndis_query(struct hn_softc *sc, uint32_t oid, const void *idata, size_t idlen, void *odata, size_t *odlen0) { return (hn_rndis_query2(sc, oid, idata, idlen, odata, odlen0, *odlen0)); } static int hn_rndis_query2(struct hn_softc *sc, uint32_t oid, const void *idata, size_t idlen, void *odata, size_t *odlen0, size_t min_odlen) { struct rndis_query_req *req; const struct rndis_query_comp *comp; struct vmbus_xact *xact; size_t reqlen, odlen = *odlen0, comp_len; int error, ofs; uint32_t rid; reqlen = sizeof(*req) + idlen; xact = vmbus_xact_get(sc->hn_xact, reqlen); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS query 0x%08x\n", oid); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_QUERY_MSG; req->rm_len = reqlen; req->rm_rid = rid; req->rm_oid = oid; /* * XXX * This is _not_ RNDIS Spec conforming: * "This MUST be set to 0 when there is no input data * associated with the OID." * * If this field was set to 0 according to the RNDIS Spec, * Hyper-V would set non-SUCCESS status in the query * completion. */ req->rm_infobufoffset = RNDIS_QUERY_REQ_INFOBUFOFFSET; if (idlen > 0) { req->rm_infobuflen = idlen; /* Input data immediately follows RNDIS query. 
*/ memcpy(req + 1, idata, idlen); } comp_len = sizeof(*comp) + min_odlen; comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, REMOTE_NDIS_QUERY_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS query 0x%08x failed\n", oid); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS query 0x%08x failed: " "status 0x%08x\n", oid, comp->rm_status); error = EIO; goto done; } if (comp->rm_infobuflen == 0 || comp->rm_infobufoffset == 0) { /* No output data! */ if_printf(sc->hn_ifp, "RNDIS query 0x%08x, no data\n", oid); *odlen0 = 0; error = 0; goto done; } /* * Check output data length and offset. */ /* ofs is the offset from the beginning of comp. */ ofs = RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(comp->rm_infobufoffset); if (ofs < sizeof(*comp) || ofs + comp->rm_infobuflen > comp_len) { if_printf(sc->hn_ifp, "RNDIS query invalid comp ib off/len, " "%u/%u\n", comp->rm_infobufoffset, comp->rm_infobuflen); error = EINVAL; goto done; } /* * Save output data. */ if (comp->rm_infobuflen < odlen) odlen = comp->rm_infobuflen; memcpy(odata, ((const uint8_t *)comp) + ofs, odlen); *odlen0 = odlen; error = 0; done: vmbus_xact_put(xact); return (error); } int -hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt) +hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt0) { struct ndis_rss_caps in, caps; size_t caps_len; - int error; + int error, indsz, rxr_cnt, hash_fnidx; + uint32_t hash_func = 0, hash_types = 0; - *rxr_cnt = 0; + *rxr_cnt0 = 0; if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_20) return (EOPNOTSUPP); memset(&in, 0, sizeof(in)); in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS; in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2; in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE; caps_len = NDIS_RSS_CAPS_SIZE; error = hn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES, &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0); if (error) return (error); /* * Preliminary verification. */ if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) { if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", caps.ndis_hdr.ndis_type); return (EINVAL); } if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) { if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", caps.ndis_hdr.ndis_rev); return (EINVAL); } if (caps.ndis_hdr.ndis_size > caps_len) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " "data size %zu\n", caps.ndis_hdr.ndis_size, caps_len); return (EINVAL); } else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", caps.ndis_hdr.ndis_size); return (EINVAL); } + /* + * Save information for later RSS configuration. 
+ */ if (caps.ndis_nrxr == 0) { if_printf(sc->hn_ifp, "0 RX rings!?\n"); return (EINVAL); } - *rxr_cnt = caps.ndis_nrxr; + if (bootverbose) + if_printf(sc->hn_ifp, "%u RX rings\n", caps.ndis_nrxr); + rxr_cnt = caps.ndis_nrxr; - if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE) { + if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE && + caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) { + if (caps.ndis_nind > NDIS_HASH_INDCNT) { + if_printf(sc->hn_ifp, + "too many RSS indirect table entries %u\n", + caps.ndis_nind); + return (EOPNOTSUPP); + } + if (!powerof2(caps.ndis_nind)) { + if_printf(sc->hn_ifp, "RSS indirect table size is not " + "power-of-2 %u\n", caps.ndis_nind); + } + + if (bootverbose) { if_printf(sc->hn_ifp, "RSS indirect table size %u\n", caps.ndis_nind); } + indsz = caps.ndis_nind; + } else { + indsz = NDIS_HASH_INDCNT; } + if (indsz < rxr_cnt) { + if_printf(sc->hn_ifp, "# of RX rings (%d) > " + "RSS indirect table size %d\n", rxr_cnt, indsz); + rxr_cnt = indsz; + } + + /* + * NOTE: + * Toeplitz is at the lowest bit, and it is preferred; so ffs(), + * instead of fls(), is used here. + */ + hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK); + if (hash_fnidx == 0) { + if_printf(sc->hn_ifp, "no hash functions, caps 0x%08x\n", + caps.ndis_caps); + return (EOPNOTSUPP); + } + hash_func = 1 << (hash_fnidx - 1); /* ffs is 1-based */ + + if (caps.ndis_caps & NDIS_RSS_CAP_IPV4) + hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4; + if (caps.ndis_caps & NDIS_RSS_CAP_IPV6) + hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6; + if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX) + hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX; + if (hash_types == 0) { + if_printf(sc->hn_ifp, "no hash types, caps 0x%08x\n", + caps.ndis_caps); + return (EOPNOTSUPP); + } + + /* Commit! */ + sc->hn_rss_ind_size = indsz; + sc->hn_rss_hash = hash_func | hash_types; + *rxr_cnt0 = rxr_cnt; return (0); } static int hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, size_t dlen) { struct rndis_set_req *req; const struct rndis_set_comp *comp; struct vmbus_xact *xact; size_t reqlen, comp_len; uint32_t rid; int error; KASSERT(dlen > 0, ("invalid dlen %zu", dlen)); reqlen = sizeof(*req) + dlen; xact = vmbus_xact_get(sc->hn_xact, reqlen); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS set 0x%08x\n", oid); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_SET_MSG; req->rm_len = reqlen; req->rm_rid = rid; req->rm_oid = oid; req->rm_infobuflen = dlen; req->rm_infobufoffset = RNDIS_SET_REQ_INFOBUFOFFSET; /* Data immediately follows RNDIS set. 
*/ memcpy(req + 1, data, dlen); comp_len = sizeof(*comp); comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, REMOTE_NDIS_SET_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS set 0x%08x failed\n", oid); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS set 0x%08x failed: " "status 0x%08x\n", oid, comp->rm_status); error = EIO; goto done; } error = 0; done: vmbus_xact_put(xact); return (error); } static int hn_rndis_conf_offload(struct hn_softc *sc, int mtu) { struct ndis_offload hwcaps; struct ndis_offload_params params; uint32_t caps = 0; size_t paramsz; int error, tso_maxsz, tso_minsg; error = hn_rndis_query_hwcaps(sc, &hwcaps); if (error) { if_printf(sc->hn_ifp, "hwcaps query failed: %d\n", error); return (error); } /* NOTE: 0 means "no change" */ memset(¶ms, 0, sizeof(params)); params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT; if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) { params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2; paramsz = NDIS_OFFLOAD_PARAMS_SIZE_6_1; } else { params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3; paramsz = NDIS_OFFLOAD_PARAMS_SIZE; } params.ndis_hdr.ndis_size = paramsz; /* * TSO4/TSO6 setup. */ tso_maxsz = IP_MAXPACKET; tso_minsg = 2; if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) { caps |= HN_CAP_TSO4; params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON; if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz) tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz; if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg) tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg; } if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) && (hwcaps.ndis_lsov2.ndis_ip6_opts & HN_NDIS_LSOV2_CAP_IP6) == HN_NDIS_LSOV2_CAP_IP6) { #ifdef notyet caps |= HN_CAP_TSO6; params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON; if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz) tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz; if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg) tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg; #endif } sc->hn_ndis_tso_szmax = 0; sc->hn_ndis_tso_sgmin = 0; if (caps & (HN_CAP_TSO4 | HN_CAP_TSO6)) { KASSERT(tso_maxsz <= IP_MAXPACKET, ("invalid NDIS TSO maxsz %d", tso_maxsz)); KASSERT(tso_minsg >= 2, ("invalid NDIS TSO minsg %d", tso_minsg)); if (tso_maxsz < tso_minsg * mtu) { if_printf(sc->hn_ifp, "invalid NDIS TSO config: " "maxsz %d, minsg %d, mtu %d; " "disable TSO4 and TSO6\n", tso_maxsz, tso_minsg, mtu); caps &= ~(HN_CAP_TSO4 | HN_CAP_TSO6); params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF; params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF; } else { sc->hn_ndis_tso_szmax = tso_maxsz; sc->hn_ndis_tso_sgmin = tso_minsg; if (bootverbose) { if_printf(sc->hn_ifp, "NDIS TSO " "szmax %d sgmin %d\n", sc->hn_ndis_tso_szmax, sc->hn_ndis_tso_sgmin); } } } /* IPv4 checksum */ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_IP4) == HN_NDIS_TXCSUM_CAP_IP4) { caps |= HN_CAP_IPCS; params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) { if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX; } /* TCP4 checksum */ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_TCP4) == HN_NDIS_TXCSUM_CAP_TCP4) { caps |= HN_CAP_TCP4CS; params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) { if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_tcp4csum = 
NDIS_OFFLOAD_PARAM_RX; } /* UDP4 checksum */ if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { caps |= HN_CAP_UDP4CS; params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) { if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX; } /* TCP6 checksum */ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_TCP6) == HN_NDIS_TXCSUM_CAP_TCP6) { caps |= HN_CAP_TCP6CS; params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) { if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX; } /* UDP6 checksum */ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_UDP6) == HN_NDIS_TXCSUM_CAP_UDP6) { caps |= HN_CAP_UDP6CS; params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) { if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX; } if (bootverbose) { if_printf(sc->hn_ifp, "offload csum: " "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n", params.ndis_ip4csum, params.ndis_tcp4csum, params.ndis_udp4csum, params.ndis_tcp6csum, params.ndis_udp6csum); if_printf(sc->hn_ifp, "offload lsov2: ip4 %u, ip6 %u\n", params.ndis_lsov2_ip4, params.ndis_lsov2_ip6); } error = hn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, ¶ms, paramsz); if (error) { if_printf(sc->hn_ifp, "offload config failed: %d\n", error); return (error); } if (bootverbose) if_printf(sc->hn_ifp, "offload config done\n"); sc->hn_caps |= caps; return (0); } int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; struct ndis_rss_params *prm = &rss->rss_params; - int error; + int error, rss_size; /* * Only NDIS 6.20+ is supported: * We only support 4bytes element in indirect table, which has been * adopted since NDIS 6.20. */ KASSERT(sc->hn_ndis_ver >= HN_NDIS_VERSION_6_20, ("NDIS 6.20+ is required, NDIS version 0x%08x", sc->hn_ndis_ver)); + /* XXX only one can be specified through, popcnt? */ + KASSERT((sc->hn_rss_hash & NDIS_HASH_FUNCTION_MASK), ("no hash func")); + KASSERT((sc->hn_rss_hash & NDIS_HASH_TYPE_MASK), ("no hash types")); + KASSERT(sc->hn_rss_ind_size > 0, ("no indirect table size")); + + if (bootverbose) { + if_printf(sc->hn_ifp, "RSS indirect table size %d, " + "hash 0x%08x\n", sc->hn_rss_ind_size, sc->hn_rss_hash); + } + /* * NOTE: * DO NOT whack rss_key and rss_ind, which are setup by the caller. 
*/ memset(prm, 0, sizeof(*prm)); + rss_size = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->hn_rss_ind_size); prm->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS; prm->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2; - prm->ndis_hdr.ndis_size = sizeof(*rss); + prm->ndis_hdr.ndis_size = rss_size; prm->ndis_flags = flags; - prm->ndis_hash = NDIS_HASH_FUNCTION_TOEPLITZ | - NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4 | - NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6; - /* TODO: Take ndis_rss_caps.ndis_nind into account */ - prm->ndis_indsize = sizeof(rss->rss_ind); + prm->ndis_hash = sc->hn_rss_hash; + prm->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->hn_rss_ind_size; prm->ndis_indoffset = __offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]); prm->ndis_keysize = sizeof(rss->rss_key); prm->ndis_keyoffset = __offsetof(struct ndis_rssprm_toeplitz, rss_key[0]); error = hn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS, - rss, sizeof(*rss)); + rss, rss_size); if (error) { if_printf(sc->hn_ifp, "RSS config failed: %d\n", error); } else { if (bootverbose) if_printf(sc->hn_ifp, "RSS config done\n"); } return (error); } int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter) { int error; error = hn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER, &filter, sizeof(filter)); if (error) { if_printf(sc->hn_ifp, "set RX filter 0x%08x failed: %d\n", filter, error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "set RX filter 0x%08x done\n", filter); } } return (error); } static int hn_rndis_init(struct hn_softc *sc) { struct rndis_init_req *req; const struct rndis_init_comp *comp; struct vmbus_xact *xact; size_t comp_len; uint32_t rid; int error; xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS init\n"); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_INITIALIZE_MSG; req->rm_len = sizeof(*req); req->rm_rid = rid; req->rm_ver_major = RNDIS_VERSION_MAJOR; req->rm_ver_minor = RNDIS_VERSION_MINOR; req->rm_max_xfersz = HN_RNDIS_XFER_SIZE; comp_len = RNDIS_INIT_COMP_SIZE_MIN; comp = hn_rndis_xact_execute(sc, xact, rid, sizeof(*req), &comp_len, REMOTE_NDIS_INITIALIZE_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS init failed\n"); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS init failed: status 0x%08x\n", comp->rm_status); error = EIO; goto done; } if (bootverbose) { if_printf(sc->hn_ifp, "RNDIS ver %u.%u, pktsz %u, pktcnt %u, " "align %u\n", comp->rm_ver_major, comp->rm_ver_minor, comp->rm_pktmaxsz, comp->rm_pktmaxcnt, 1U << comp->rm_align); } error = 0; done: vmbus_xact_put(xact); return (error); } static int hn_rndis_halt(struct hn_softc *sc) { struct vmbus_xact *xact; struct rndis_halt_req *halt; - struct hn_send_ctx sndc; + struct hn_nvs_sendctx sndc; size_t comp_len; xact = vmbus_xact_get(sc->hn_xact, sizeof(*halt)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS halt\n"); return (ENXIO); } halt = vmbus_xact_req_data(xact); halt->rm_type = REMOTE_NDIS_HALT_MSG; halt->rm_len = sizeof(*halt); halt->rm_rid = hn_rndis_rid(sc); /* No RNDIS completion; rely on NVS message send completion */ - hn_send_ctx_init(&sndc, hn_nvs_sent_xact, xact); + hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); hn_rndis_xact_exec1(sc, xact, sizeof(*halt), &sndc, &comp_len); vmbus_xact_put(xact); if (bootverbose) if_printf(sc->hn_ifp, "RNDIS halt done\n"); return (0); } static int hn_rndis_query_hwcaps(struct hn_softc *sc, struct ndis_offload *caps) { struct ndis_offload 
in; size_t caps_len, size; int error; memset(&in, 0, sizeof(in)); in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD; if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3; size = NDIS_OFFLOAD_SIZE; } else if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_1) { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2; size = NDIS_OFFLOAD_SIZE_6_1; } else { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1; size = NDIS_OFFLOAD_SIZE_6_0; } in.ndis_hdr.ndis_size = size; caps_len = NDIS_OFFLOAD_SIZE; error = hn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES, &in, size, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0); if (error) return (error); /* * Preliminary verification. */ if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) { if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", caps->ndis_hdr.ndis_type); return (EINVAL); } if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) { if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", caps->ndis_hdr.ndis_rev); return (EINVAL); } if (caps->ndis_hdr.ndis_size > caps_len) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " "data size %zu\n", caps->ndis_hdr.ndis_size, caps_len); return (EINVAL); } else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", caps->ndis_hdr.ndis_size); return (EINVAL); } if (bootverbose) { /* * NOTE: * caps->ndis_hdr.ndis_size MUST be checked before accessing * NDIS 6.1+ specific fields. */ if_printf(sc->hn_ifp, "hwcaps rev %u\n", caps->ndis_hdr.ndis_rev); if_printf(sc->hn_ifp, "hwcaps csum: " "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, " "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n", caps->ndis_csum.ndis_ip4_txcsum, caps->ndis_csum.ndis_ip4_txenc, caps->ndis_csum.ndis_ip4_rxcsum, caps->ndis_csum.ndis_ip4_rxenc, caps->ndis_csum.ndis_ip6_txcsum, caps->ndis_csum.ndis_ip6_txenc, caps->ndis_csum.ndis_ip6_rxcsum, caps->ndis_csum.ndis_ip6_rxenc); if_printf(sc->hn_ifp, "hwcaps lsov2: " "ip4 maxsz %u minsg %u encap 0x%x, " "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n", caps->ndis_lsov2.ndis_ip4_maxsz, caps->ndis_lsov2.ndis_ip4_minsg, caps->ndis_lsov2.ndis_ip4_encap, caps->ndis_lsov2.ndis_ip6_maxsz, caps->ndis_lsov2.ndis_ip6_minsg, caps->ndis_lsov2.ndis_ip6_encap, caps->ndis_lsov2.ndis_ip6_opts); } return (0); } int hn_rndis_attach(struct hn_softc *sc, int mtu) { int error; /* * Initialize RNDIS. */ error = hn_rndis_init(sc); if (error) return (error); /* * Configure NDIS offload settings. * XXX no offloading, if error happened? */ hn_rndis_conf_offload(sc, mtu); return (0); } void hn_rndis_detach(struct hn_softc *sc) { /* Halt the RNDIS. */ hn_rndis_halt(sc); -} - -void -hv_rf_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) -{ - - hn_chan_rollup(rxr, txr); } Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/if_hnvar.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/if_hnvar.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/if_hnvar.h (revision 308054) @@ -1,129 +1,234 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IF_HNVAR_H_ #define _IF_HNVAR_H_ -#include +#define HN_USE_TXDESC_BUFRING -#include -#include +#define HN_CHIM_SIZE (15 * 1024 * 1024) -struct hn_softc; +#define HN_RXBUF_SIZE (16 * 1024 * 1024) +#define HN_RXBUF_SIZE_COMPAT (15 * 1024 * 1024) -struct vmbus_channel; -struct hn_send_ctx; +/* Claimed to be 12232B */ +#define HN_MTU_MAX (9 * 1024) -typedef void (*hn_sent_callback_t) - (struct hn_send_ctx *, struct hn_softc *, - struct vmbus_channel *, const void *, int); +#define HN_PKTBUF_LEN 4096 -struct hn_send_ctx { - hn_sent_callback_t hn_cb; - void *hn_cbarg; -}; +#define HN_TXBR_SIZE (128 * PAGE_SIZE) +#define HN_RXBR_SIZE (128 * PAGE_SIZE) -#define HN_NDIS_VLAN_INFO_INVALID 0xffffffff -#define HN_NDIS_RXCSUM_INFO_INVALID 0 -#define HN_NDIS_HASH_INFO_INVALID 0 +#define HN_XACT_REQ_PGCNT 2 +#define HN_XACT_RESP_PGCNT 2 +#define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE) +#define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE) -struct hn_recvinfo { - uint32_t vlan_info; - uint32_t csum_info; - uint32_t hash_info; - uint32_t hash_value; -}; +#define HN_GPACNT_MAX 32 -#define HN_SEND_CTX_INITIALIZER(cb, cbarg) \ -{ \ - .hn_cb = cb, \ - .hn_cbarg = cbarg \ -} +struct hn_txdesc; +#ifndef HN_USE_TXDESC_BUFRING +SLIST_HEAD(hn_txdesc_list, hn_txdesc); +#else +struct buf_ring; +#endif +struct hn_tx_ring; -static __inline void -hn_send_ctx_init(struct hn_send_ctx *sndc, hn_sent_callback_t cb, void *cbarg) -{ +struct hn_rx_ring { + struct ifnet *hn_ifp; + struct hn_tx_ring *hn_txr; + void *hn_pktbuf; + uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */ + int hn_rx_idx; - sndc->hn_cb = cb; - sndc->hn_cbarg = cbarg; -} + /* Trust csum verification on host side */ + int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ + struct lro_ctrl hn_lro; -static __inline int -hn_nvs_send(struct vmbus_channel *chan, uint16_t flags, - void *nvs_msg, int nvs_msglen, struct hn_send_ctx *sndc) -{ + u_long hn_csum_ip; + u_long hn_csum_tcp; + u_long hn_csum_udp; + u_long hn_csum_trusted; + u_long hn_lro_tried; + u_long hn_small_pkts; + u_long hn_pkts; + u_long hn_rss_pkts; - return (vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, flags, - nvs_msg, nvs_msglen, (uint64_t)(uintptr_t)sndc)); -} + /* Rarely used stuffs */ + struct sysctl_oid *hn_rx_sysctl_tree; + int hn_rx_flags; -static __inline int -hn_nvs_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, - void *nvs_msg, int nvs_msglen, struct hn_send_ctx *sndc) -{ + void *hn_br; /* TX/RX bufring */ + struct hyperv_dma hn_br_dma; +} __aligned(CACHE_LINE_SIZE); - return (vmbus_chan_send_sglist(chan, sg, sglen, 
nvs_msg, nvs_msglen, - (uint64_t)(uintptr_t)sndc)); -} +#define HN_TRUST_HCSUM_IP 0x0001 +#define HN_TRUST_HCSUM_TCP 0x0002 +#define HN_TRUST_HCSUM_UDP 0x0004 -struct vmbus_xact; -struct rndis_packet_msg; +#define HN_RX_FLAG_ATTACHED 0x1 -uint32_t hn_chim_alloc(struct hn_softc *sc); -void hn_chim_free(struct hn_softc *sc, uint32_t chim_idx); +struct hn_tx_ring { +#ifndef HN_USE_TXDESC_BUFRING + struct mtx hn_txlist_spin; + struct hn_txdesc_list hn_txlist; +#else + struct buf_ring *hn_txdesc_br; +#endif + int hn_txdesc_cnt; + int hn_txdesc_avail; + u_short hn_has_txeof; + u_short hn_txdone_cnt; -int hn_rndis_attach(struct hn_softc *sc, int mtu); -void hn_rndis_detach(struct hn_softc *sc); -int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags); -void *hn_rndis_pktinfo_append(struct rndis_packet_msg *, - size_t pktsize, size_t pi_dlen, uint32_t pi_type); -int hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt); -int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr); -int hn_rndis_get_linkstatus(struct hn_softc *sc, - uint32_t *link_status); -/* filter: NDIS_PACKET_TYPE_ or 0. */ -int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter); + int hn_sched_tx; + void (*hn_txeof)(struct hn_tx_ring *); + struct taskqueue *hn_tx_taskq; + struct task hn_tx_task; + struct task hn_txeof_task; -int hn_nvs_attach(struct hn_softc *sc, int mtu); -void hn_nvs_detach(struct hn_softc *sc); -int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch); -void hn_nvs_sent_xact(struct hn_send_ctx *sndc, struct hn_softc *sc, - struct vmbus_channel *chan, const void *data, int dlen); -int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, - struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, - int gpa_cnt); + struct buf_ring *hn_mbuf_br; + int hn_oactive; + int hn_tx_idx; + int hn_tx_flags; -int hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, - const struct hn_recvinfo *info); -void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr); -void hn_link_status_update(struct hn_softc *sc); -void hn_network_change(struct hn_softc *sc); + struct mtx hn_tx_lock; + struct hn_softc *hn_sc; + struct vmbus_channel *hn_chan; -extern struct hn_send_ctx hn_send_ctx_none; + int hn_direct_tx_size; + int hn_chim_size; + bus_dma_tag_t hn_tx_data_dtag; + uint64_t hn_csum_assist; + + int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *); + int hn_suspended; + int hn_gpa_cnt; + struct vmbus_gpa hn_gpa[HN_GPACNT_MAX]; + + u_long hn_no_txdescs; + u_long hn_send_failed; + u_long hn_txdma_failed; + u_long hn_tx_collapsed; + u_long hn_tx_chimney_tried; + u_long hn_tx_chimney; + u_long hn_pkts; + + /* Rarely used stuffs */ + struct hn_txdesc *hn_txdesc; + bus_dma_tag_t hn_tx_rndis_dtag; + struct sysctl_oid *hn_tx_sysctl_tree; +} __aligned(CACHE_LINE_SIZE); + +#define HN_TX_FLAG_ATTACHED 0x1 +#define HN_TX_FLAG_HASHVAL 0x2 /* support HASHVAL pktinfo */ + +/* + * Device-specific softc structure + */ +struct hn_softc { + struct ifnet *hn_ifp; + struct ifmedia hn_media; + device_t hn_dev; + int hn_if_flags; + struct sx hn_lock; + struct vmbus_channel *hn_prichan; + + int hn_rx_ring_cnt; + int hn_rx_ring_inuse; + struct hn_rx_ring *hn_rx_ring; + + int hn_tx_ring_cnt; + int hn_tx_ring_inuse; + struct hn_tx_ring *hn_tx_ring; + + uint8_t *hn_chim; + u_long *hn_chim_bmap; + int hn_chim_bmap_cnt; + int hn_chim_cnt; + int hn_chim_szmax; + + int hn_cpu; + struct taskqueue *hn_tx_taskq; + struct sysctl_oid *hn_tx_sysctl_tree; + struct sysctl_oid *hn_rx_sysctl_tree; + struct vmbus_xact_ctx *hn_xact; + uint32_t 
hn_nvs_ver; + uint32_t hn_rx_filter; + + struct taskqueue *hn_mgmt_taskq; + struct taskqueue *hn_mgmt_taskq0; + struct task hn_link_task; + struct task hn_netchg_init; + struct timeout_task hn_netchg_status; + uint32_t hn_link_flags; /* HN_LINK_FLAG_ */ + + uint32_t hn_caps; /* HN_CAP_ */ + uint32_t hn_flags; /* HN_FLAG_ */ + void *hn_rxbuf; + uint32_t hn_rxbuf_gpadl; + struct hyperv_dma hn_rxbuf_dma; + + uint32_t hn_chim_gpadl; + struct hyperv_dma hn_chim_dma; + + uint32_t hn_rndis_rid; + uint32_t hn_ndis_ver; + int hn_ndis_tso_szmax; + int hn_ndis_tso_sgmin; + + int hn_rss_ind_size; + uint32_t hn_rss_hash; /* NDIS_HASH_ */ + struct ndis_rssprm_toeplitz hn_rss; +}; + +#define HN_FLAG_RXBUF_CONNECTED 0x0001 +#define HN_FLAG_CHIM_CONNECTED 0x0002 +#define HN_FLAG_HAS_RSSKEY 0x0004 +#define HN_FLAG_HAS_RSSIND 0x0008 +#define HN_FLAG_SYNTH_ATTACHED 0x0010 + +#define HN_CAP_VLAN 0x0001 +#define HN_CAP_MTU 0x0002 +#define HN_CAP_IPCS 0x0004 +#define HN_CAP_TCP4CS 0x0008 +#define HN_CAP_TCP6CS 0x0010 +#define HN_CAP_UDP4CS 0x0020 +#define HN_CAP_UDP6CS 0x0040 +#define HN_CAP_TSO4 0x0080 +#define HN_CAP_TSO6 0x0100 +#define HN_CAP_HASHVAL 0x0200 + +/* Capability description for use with printf(9) %b identifier. */ +#define HN_CAP_BITS \ + "\020\1VLAN\2MTU\3IPCS\4TCP4CS\5TCP6CS" \ + "\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL" + +#define HN_LINK_FLAG_LINKUP 0x0001 +#define HN_LINK_FLAG_NETCHG 0x0002 #endif /* !_IF_HNVAR_H_ */ Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/ndis.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/ndis.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/ndis.h (revision 308054) @@ -1,390 +1,398 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _NET_NDIS_H_ #define _NET_NDIS_H_ #define NDIS_MEDIA_STATE_CONNECTED 0 #define NDIS_MEDIA_STATE_DISCONNECTED 1 #define NDIS_NETCHANGE_TYPE_POSSIBLE 1 #define NDIS_NETCHANGE_TYPE_DEFINITE 2 #define NDIS_NETCHANGE_TYPE_FROMMEDIA 3 #define NDIS_OFFLOAD_SET_NOCHG 0 #define NDIS_OFFLOAD_SET_ON 1 #define NDIS_OFFLOAD_SET_OFF 2 /* a.k.a GRE MAC */ #define NDIS_ENCAP_TYPE_NVGRE 0x00000001 #define NDIS_HASH_FUNCTION_MASK 0x000000FF /* see hash function */ #define NDIS_HASH_TYPE_MASK 0x00FFFF00 /* see hash type */ /* hash function */ #define NDIS_HASH_FUNCTION_TOEPLITZ 0x00000001 /* hash type */ #define NDIS_HASH_IPV4 0x00000100 #define NDIS_HASH_TCP_IPV4 0x00000200 #define NDIS_HASH_IPV6 0x00000400 #define NDIS_HASH_IPV6_EX 0x00000800 #define NDIS_HASH_TCP_IPV6 0x00001000 #define NDIS_HASH_TCP_IPV6_EX 0x00002000 +/* Hash description for use with printf(9) %b identifier. */ +#define NDIS_HASH_BITS \ + "\20\1TOEPLITZ\11IP4\12TCP4\13IP6\14IP6EX\15TCP6\16TCP6EX" + #define NDIS_HASH_KEYSIZE_TOEPLITZ 40 #define NDIS_HASH_INDCNT 128 #define NDIS_OBJTYPE_DEFAULT 0x80 #define NDIS_OBJTYPE_RSS_CAPS 0x88 #define NDIS_OBJTYPE_RSS_PARAMS 0x89 #define NDIS_OBJTYPE_OFFLOAD 0xa7 struct ndis_object_hdr { uint8_t ndis_type; /* NDIS_OBJTYPE_ */ uint8_t ndis_rev; /* type specific */ uint16_t ndis_size; /* incl. this hdr */ }; /* * OID_TCP_OFFLOAD_PARAMETERS * ndis_type: NDIS_OBJTYPE_DEFAULT */ struct ndis_offload_params { struct ndis_object_hdr ndis_hdr; uint8_t ndis_ip4csum; /* NDIS_OFFLOAD_PARAM_ */ uint8_t ndis_tcp4csum; /* NDIS_OFFLOAD_PARAM_ */ uint8_t ndis_udp4csum; /* NDIS_OFFLOAD_PARAM_ */ uint8_t ndis_tcp6csum; /* NDIS_OFFLOAD_PARAM_ */ uint8_t ndis_udp6csum; /* NDIS_OFFLOAD_PARAM_ */ uint8_t ndis_lsov1; /* NDIS_OFFLOAD_PARAM_ */ uint8_t ndis_ipsecv1; /* NDIS_OFFLOAD_IPSECV1_ */ uint8_t ndis_lsov2_ip4; /* NDIS_OFFLOAD_LSOV2_ */ uint8_t ndis_lsov2_ip6; /* NDIS_OFFLOAD_LSOV2_ */ uint8_t ndis_tcp4conn; /* 0 */ uint8_t ndis_tcp6conn; /* 0 */ uint32_t ndis_flags; /* 0 */ /* NDIS >= 6.1 */ uint8_t ndis_ipsecv2; /* NDIS_OFFLOAD_IPSECV2_ */ uint8_t ndis_ipsecv2_ip4;/* NDIS_OFFLOAD_IPSECV2_ */ /* NDIS >= 6.30 */ uint8_t ndis_rsc_ip4; /* NDIS_OFFLOAD_RSC_ */ uint8_t ndis_rsc_ip6; /* NDIS_OFFLOAD_RSC_ */ uint8_t ndis_encap; /* NDIS_OFFLOAD_SET_ */ uint8_t ndis_encap_types;/* NDIS_ENCAP_TYPE_ */ }; #define NDIS_OFFLOAD_PARAMS_SIZE sizeof(struct ndis_offload_params) #define NDIS_OFFLOAD_PARAMS_SIZE_6_1 \ __offsetof(struct ndis_offload_params, ndis_rsc_ip4) #define NDIS_OFFLOAD_PARAMS_REV_2 2 /* NDIS 6.1 */ #define NDIS_OFFLOAD_PARAMS_REV_3 3 /* NDIS 6.30 */ #define NDIS_OFFLOAD_PARAM_NOCHG 0 /* common */ #define NDIS_OFFLOAD_PARAM_OFF 1 #define NDIS_OFFLOAD_PARAM_TX 2 #define NDIS_OFFLOAD_PARAM_RX 3 #define NDIS_OFFLOAD_PARAM_TXRX 4 /* NDIS_OFFLOAD_PARAM_NOCHG */ #define NDIS_OFFLOAD_LSOV1_OFF 1 #define NDIS_OFFLOAD_LSOV1_ON 2 /* NDIS_OFFLOAD_PARAM_NOCHG */ #define NDIS_OFFLOAD_IPSECV1_OFF 1 #define NDIS_OFFLOAD_IPSECV1_AH 2 #define NDIS_OFFLOAD_IPSECV1_ESP 3 #define NDIS_OFFLOAD_IPSECV1_AH_ESP 4 /* NDIS_OFFLOAD_PARAM_NOCHG */ #define NDIS_OFFLOAD_LSOV2_OFF 1 #define NDIS_OFFLOAD_LSOV2_ON 2 /* NDIS_OFFLOAD_PARAM_NOCHG */ #define NDIS_OFFLOAD_IPSECV2_OFF 1 #define NDIS_OFFLOAD_IPSECV2_AH 2 #define NDIS_OFFLOAD_IPSECV2_ESP 3 #define NDIS_OFFLOAD_IPSECV2_AH_ESP 4 /* NDIS_OFFLOAD_PARAM_NOCHG */ #define NDIS_OFFLOAD_RSC_OFF 1 #define NDIS_OFFLOAD_RSC_ON 2 /* * OID_GEN_RECEIVE_SCALE_CAPABILITIES * ndis_type: NDIS_OBJTYPE_RSS_CAPS */ struct ndis_rss_caps { struct ndis_object_hdr ndis_hdr; - uint32_t 
ndis_flags; /* NDIS_RSS_CAP_ */ + uint32_t ndis_caps; /* NDIS_RSS_CAP_ */ uint32_t ndis_nmsi; /* # of MSIs */ uint32_t ndis_nrxr; /* # of RX rings */ /* NDIS >= 6.30 */ uint16_t ndis_nind; /* # of indtbl ent. */ uint16_t ndis_pad; }; #define NDIS_RSS_CAPS_SIZE \ __offsetof(struct ndis_rss_caps, ndis_pad) #define NDIS_RSS_CAPS_SIZE_6_0 \ __offsetof(struct ndis_rss_caps, ndis_nind) #define NDIS_RSS_CAPS_REV_1 1 /* NDIS 6.{0,1,20} */ #define NDIS_RSS_CAPS_REV_2 2 /* NDIS 6.30 */ #define NDIS_RSS_CAP_MSI 0x01000000 #define NDIS_RSS_CAP_CLASSIFY_ISR 0x02000000 #define NDIS_RSS_CAP_CLASSIFY_DPC 0x04000000 #define NDIS_RSS_CAP_MSIX 0x08000000 #define NDIS_RSS_CAP_IPV4 0x00000100 #define NDIS_RSS_CAP_IPV6 0x00000200 #define NDIS_RSS_CAP_IPV6_EX 0x00000400 -#define NDIS_RSS_CAP_HASH_TOEPLITZ 0x00000001 +#define NDIS_RSS_CAP_HASH_TOEPLITZ NDIS_HASH_FUNCTION_TOEPLITZ +#define NDIS_RSS_CAP_HASHFUNC_MASK NDIS_HASH_FUNCTION_MASK /* * OID_GEN_RECEIVE_SCALE_PARAMETERS * ndis_type: NDIS_OBJTYPE_RSS_PARAMS */ struct ndis_rss_params { struct ndis_object_hdr ndis_hdr; uint16_t ndis_flags; /* NDIS_RSS_FLAG_ */ uint16_t ndis_bcpu; /* base cpu 0 */ uint32_t ndis_hash; /* NDIS_HASH_ */ uint16_t ndis_indsize; /* indirect table */ uint32_t ndis_indoffset; uint16_t ndis_keysize; /* hash key */ uint32_t ndis_keyoffset; /* NDIS >= 6.20 */ uint32_t ndis_cpumaskoffset; uint32_t ndis_cpumaskcnt; uint32_t ndis_cpumaskentsz; }; #define NDIS_RSS_PARAMS_SIZE sizeof(struct ndis_rss_params) #define NDIS_RSS_PARAMS_SIZE_6_0 \ __offsetof(struct ndis_rss_params, ndis_cpumaskoffset) #define NDIS_RSS_PARAMS_REV_1 1 /* NDIS 6.0 */ #define NDIS_RSS_PARAMS_REV_2 2 /* NDIS 6.20 */ #define NDIS_RSS_FLAG_NONE 0x0000 #define NDIS_RSS_FLAG_BCPU_UNCHG 0x0001 #define NDIS_RSS_FLAG_HASH_UNCHG 0x0002 #define NDIS_RSS_FLAG_IND_UNCHG 0x0004 #define NDIS_RSS_FLAG_KEY_UNCHG 0x0008 #define NDIS_RSS_FLAG_DISABLE 0x0010 /* non-standard convenient struct */ struct ndis_rssprm_toeplitz { struct ndis_rss_params rss_params; /* Toeplitz hash key */ uint8_t rss_key[NDIS_HASH_KEYSIZE_TOEPLITZ]; /* Indirect table */ uint32_t rss_ind[NDIS_HASH_INDCNT]; }; + +#define NDIS_RSSPRM_TOEPLITZ_SIZE(nind) \ + __offsetof(struct ndis_rssprm_toeplitz, rss_ind[nind]) /* * OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES * ndis_type: NDIS_OBJTYPE_OFFLOAD */ #define NDIS_OFFLOAD_ENCAP_NONE 0x0000 #define NDIS_OFFLOAD_ENCAP_NULL 0x0001 #define NDIS_OFFLOAD_ENCAP_8023 0x0002 #define NDIS_OFFLOAD_ENCAP_8023PQ 0x0004 #define NDIS_OFFLOAD_ENCAP_8023PQ_OOB 0x0008 #define NDIS_OFFLOAD_ENCAP_RFC1483 0x0010 struct ndis_csum_offload { uint32_t ndis_ip4_txenc; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_ip4_txcsum; #define NDIS_TXCSUM_CAP_IP4OPT 0x001 #define NDIS_TXCSUM_CAP_TCP4OPT 0x004 #define NDIS_TXCSUM_CAP_TCP4 0x010 #define NDIS_TXCSUM_CAP_UDP4 0x040 #define NDIS_TXCSUM_CAP_IP4 0x100 uint32_t ndis_ip4_rxenc; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_ip4_rxcsum; #define NDIS_RXCSUM_CAP_IP4OPT 0x001 #define NDIS_RXCSUM_CAP_TCP4OPT 0x004 #define NDIS_RXCSUM_CAP_TCP4 0x010 #define NDIS_RXCSUM_CAP_UDP4 0x040 #define NDIS_RXCSUM_CAP_IP4 0x100 uint32_t ndis_ip6_txenc; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_ip6_txcsum; #define NDIS_TXCSUM_CAP_IP6EXT 0x001 #define NDIS_TXCSUM_CAP_TCP6OPT 0x004 #define NDIS_TXCSUM_CAP_TCP6 0x010 #define NDIS_TXCSUM_CAP_UDP6 0x040 uint32_t ndis_ip6_rxenc; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_ip6_rxcsum; #define NDIS_RXCSUM_CAP_IP6EXT 0x001 #define NDIS_RXCSUM_CAP_TCP6OPT 0x004 #define NDIS_RXCSUM_CAP_TCP6 0x010 #define NDIS_RXCSUM_CAP_UDP6 0x040 }; struct ndis_lsov1_offload 
{ uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_maxsize; uint32_t ndis_minsegs; uint32_t ndis_opts; }; struct ndis_ipsecv1_offload { uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_ah_esp; uint32_t ndis_xport_tun; uint32_t ndis_ip4_opts; uint32_t ndis_flags; uint32_t ndis_ip4_ah; uint32_t ndis_ip4_esp; }; struct ndis_lsov2_offload { uint32_t ndis_ip4_encap; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_ip4_maxsz; uint32_t ndis_ip4_minsg; uint32_t ndis_ip6_encap; /*NDIS_OFFLOAD_ENCAP_*/ uint32_t ndis_ip6_maxsz; uint32_t ndis_ip6_minsg; uint32_t ndis_ip6_opts; #define NDIS_LSOV2_CAP_IP6EXT 0x001 #define NDIS_LSOV2_CAP_TCP6OPT 0x004 }; struct ndis_ipsecv2_offload { uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ uint16_t ndis_ip6; uint16_t ndis_ip4opt; uint16_t ndis_ip6ext; uint16_t ndis_ah; uint16_t ndis_esp; uint16_t ndis_ah_esp; uint16_t ndis_xport; uint16_t ndis_tun; uint16_t ndis_xport_tun; uint16_t ndis_lso; uint16_t ndis_extseq; uint32_t ndis_udp_esp; uint32_t ndis_auth; uint32_t ndis_crypto; uint32_t ndis_sa_caps; }; struct ndis_rsc_offload { uint16_t ndis_ip4; uint16_t ndis_ip6; }; struct ndis_encap_offload { uint32_t ndis_flags; uint32_t ndis_maxhdr; }; struct ndis_offload { struct ndis_object_hdr ndis_hdr; struct ndis_csum_offload ndis_csum; struct ndis_lsov1_offload ndis_lsov1; struct ndis_ipsecv1_offload ndis_ipsecv1; struct ndis_lsov2_offload ndis_lsov2; uint32_t ndis_flags; /* NDIS >= 6.1 */ struct ndis_ipsecv2_offload ndis_ipsecv2; /* NDIS >= 6.30 */ struct ndis_rsc_offload ndis_rsc; struct ndis_encap_offload ndis_encap_gre; }; #define NDIS_OFFLOAD_SIZE sizeof(struct ndis_offload) #define NDIS_OFFLOAD_SIZE_6_0 \ __offsetof(struct ndis_offload, ndis_ipsecv2) #define NDIS_OFFLOAD_SIZE_6_1 \ __offsetof(struct ndis_offload, ndis_rsc) #define NDIS_OFFLOAD_REV_1 1 /* NDIS 6.0 */ #define NDIS_OFFLOAD_REV_2 2 /* NDIS 6.1 */ #define NDIS_OFFLOAD_REV_3 3 /* NDIS 6.30 */ /* * Per-packet-info */ /* VLAN */ #define NDIS_VLAN_INFO_SIZE sizeof(uint32_t) #define NDIS_VLAN_INFO_PRI_MASK 0x0007 #define NDIS_VLAN_INFO_CFI_MASK 0x0008 #define NDIS_VLAN_INFO_ID_MASK 0xfff0 #define NDIS_VLAN_INFO_MAKE(id, pri, cfi) \ (((pri) & NDIS_VLAN_INFO_PRI_MASK) | \ (((cfi) & 0x1) << 3) | (((id) & 0xfff) << 4)) #define NDIS_VLAN_INFO_ID(inf) (((inf) & NDIS_VLAN_INFO_ID_MASK) >> 4) #define NDIS_VLAN_INFO_CFI(inf) (((inf) & NDIS_VLAN_INFO_CFI_MASK) >> 3) #define NDIS_VLAN_INFO_PRI(inf) ((inf) & NDIS_VLAN_INFO_PRI_MASK) /* Reception checksum */ #define NDIS_RXCSUM_INFO_SIZE sizeof(uint32_t) #define NDIS_RXCSUM_INFO_TCPCS_FAILED 0x0001 #define NDIS_RXCSUM_INFO_UDPCS_FAILED 0x0002 #define NDIS_RXCSUM_INFO_IPCS_FAILED 0x0004 #define NDIS_RXCSUM_INFO_TCPCS_OK 0x0008 #define NDIS_RXCSUM_INFO_UDPCS_OK 0x0010 #define NDIS_RXCSUM_INFO_IPCS_OK 0x0020 #define NDIS_RXCSUM_INFO_LOOPBACK 0x0040 #define NDIS_RXCSUM_INFO_TCPCS_INVAL 0x0080 #define NDIS_RXCSUM_INFO_IPCS_INVAL 0x0100 /* LSOv2 */ #define NDIS_LSO2_INFO_SIZE sizeof(uint32_t) #define NDIS_LSO2_INFO_MSS_MASK 0x000fffff #define NDIS_LSO2_INFO_THOFF_MASK 0x3ff00000 #define NDIS_LSO2_INFO_ISLSO2 0x40000000 #define NDIS_LSO2_INFO_ISIPV6 0x80000000 #define NDIS_LSO2_INFO_MAKE(thoff, mss) \ ((((uint32_t)(mss)) & NDIS_LSO2_INFO_MSS_MASK) | \ ((((uint32_t)(thoff)) & 0x3ff) << 20) | \ NDIS_LSO2_INFO_ISLSO2) #define NDIS_LSO2_INFO_MAKEIPV4(thoff, mss) \ NDIS_LSO2_INFO_MAKE((thoff), (mss)) #define NDIS_LSO2_INFO_MAKEIPV6(thoff, mss) \ (NDIS_LSO2_INFO_MAKE((thoff), (mss)) | NDIS_LSO2_INFO_ISIPV6) /* Transmission checksum */ #define NDIS_TXCSUM_INFO_SIZE 
sizeof(uint32_t) #define NDIS_TXCSUM_INFO_IPV4 0x00000001 #define NDIS_TXCSUM_INFO_IPV6 0x00000002 #define NDIS_TXCSUM_INFO_TCPCS 0x00000004 #define NDIS_TXCSUM_INFO_UDPCS 0x00000008 #define NDIS_TXCSUM_INFO_IPCS 0x00000010 #define NDIS_TXCSUM_INFO_THOFF 0x03ff0000 #endif /* !_NET_NDIS_H_ */ Index: user/alc/PQ_LAUNDRY/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/mlx4/mlx4_en/mlx4_en_tx.c (revision 308054) @@ -1,1119 +1,1118 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ #define LINUXKPI_PARAM_PREFIX mlx4_ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "en.h" enum { MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ MAX_BF = 256, MIN_PKT_LEN = 17, }; static int inline_thold __read_mostly = MAX_INLINE; module_param_named(inline_thold, inline_thold, uint, 0444); MODULE_PARM_DESC(inline_thold, "threshold for using inline data"); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring, u32 size, u16 stride, int node, int queue_idx) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring; uint32_t x; int tmp; int err; ring = kzalloc_node(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL, node); if (!ring) { ring = kzalloc(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL); if (!ring) { en_err(priv, "Failed allocating TX ring\n"); return -ENOMEM; } } /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX4_EN_TX_MAX_PAYLOAD_SIZE, /* maxsize */ MLX4_EN_TX_MAX_MBUF_FRAGS, /* nsegments */ MLX4_EN_TX_MAX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &ring->dma_tag))) goto done; ring->size = size; ring->size_mask = size - 1; ring->stride = stride; ring->inline_thold = MAX(MIN_PKT_LEN, MIN(inline_thold, MAX_INLINE)); mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF); mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF); /* Allocate the buf ring */ ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF, M_WAITOK, &ring->tx_lock.m); if (ring->br == NULL) { en_err(priv, "Failed allocating tx_info ring\n"); err = -ENOMEM; goto err_free_dma_tag; } tmp = size * sizeof(struct mlx4_en_tx_info); ring->tx_info = kzalloc_node(tmp, GFP_KERNEL, node); if (!ring->tx_info) { ring->tx_info = kzalloc(tmp, GFP_KERNEL); if (!ring->tx_info) { err = -ENOMEM; goto err_ring; } } /* Create DMA descriptor MAPs */ for (x = 0; x != size; x++) { err = -bus_dmamap_create(ring->dma_tag, 0, &ring->tx_info[x].dma_map); if (err != 0) { while (x--) { bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); } goto err_info; } } en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", ring->tx_info, tmp); ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); /* Allocate HW buffers on provided NUMA node */ err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); if (err) { en_err(priv, "Failed allocating hwq resources\n"); goto err_dma_map; } err = mlx4_en_map_buffer(&ring->wqres.buf); if (err) { en_err(priv, "Failed to map TX buffer\n"); goto err_hwq_res; } ring->buf = ring->wqres.buf.direct.buf; en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d " "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, MLX4_RESERVE_BF_QP); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_map; } err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_reserve; } ring->qp.event = mlx4_en_sqp_event; err = mlx4_bf_alloc(mdev->dev, &ring->bf, node); if (err) { en_dbg(DRV, priv, "working without blueflame (%d)", err); ring->bf.uar = &mdev->priv_uar; ring->bf.uar->map = mdev->uar_map; 
ring->bf_enabled = false; } else ring->bf_enabled = true; ring->queue_index = queue_idx; if (queue_idx < priv->num_tx_rings_p_up ) CPU_SET(queue_idx, &ring->affinity_mask); *pring = ring; return 0; err_reserve: mlx4_qp_release_range(mdev->dev, ring->qpn, 1); err_map: mlx4_en_unmap_buffer(&ring->wqres.buf); err_hwq_res: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_dma_map: for (x = 0; x != size; x++) bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); err_info: vfree(ring->tx_info); err_ring: buf_ring_free(ring->br, M_DEVBUF); err_free_dma_tag: bus_dma_tag_destroy(ring->dma_tag); done: kfree(ring); return err; } void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring) { struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_tx_ring *ring = *pring; uint32_t x; en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); buf_ring_free(ring->br, M_DEVBUF); if (ring->bf_enabled) mlx4_bf_free(mdev->dev, &ring->bf); mlx4_qp_remove(mdev->dev, &ring->qp); mlx4_qp_free(mdev->dev, &ring->qp); mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1); mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); for (x = 0; x != ring->size; x++) bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map); vfree(ring->tx_info); mtx_destroy(&ring->tx_lock.m); mtx_destroy(&ring->comp_lock.m); bus_dma_tag_destroy(ring->dma_tag); kfree(ring); *pring = NULL; } int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int cq, int user_prio) { struct mlx4_en_dev *mdev = priv->mdev; int err; ring->cqn = cq; ring->prod = 0; ring->cons = 0xffffffff; ring->last_nr_txbb = 1; ring->poll_cnt = 0; ring->blocked = 0; memset(ring->buf, 0, ring->buf_size); ring->qp_state = MLX4_QP_STATE_RST; ring->doorbell_qpn = ring->qp.qpn << 8; mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, ring->cqn, user_prio, &ring->context); if (ring->bf_enabled) ring->context.usr_page = cpu_to_be32(ring->bf.uar->index); err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context, &ring->qp, &ring->qp_state); return err; } void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring) { struct mlx4_en_dev *mdev = priv->mdev; mlx4_qp_modify(mdev->dev, NULL, ring->qp_state, MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp); } static volatile struct mlx4_wqe_data_seg * mlx4_en_store_inline_lso_data(volatile struct mlx4_wqe_data_seg *dseg, struct mbuf *mb, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); /* copy data into place */ m_copydata(mb, 0, len, inl + 4); dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT); return (dseg); } static void mlx4_en_store_inline_lso_header(volatile struct mlx4_wqe_data_seg *dseg, int len, __be32 owner_bit) { } static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 index, u8 owner) { struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; struct mlx4_en_tx_desc *tx_desc = (struct mlx4_en_tx_desc *) (ring->buf + (index * TXBB_SIZE)); volatile __be32 *ptr = (__be32 *)tx_desc; const __be32 stamp = cpu_to_be32(STAMP_VAL | ((u32)owner << STAMP_SHIFT)); u32 i; /* Stamp the freed descriptor */ for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) { *ptr = stamp; ptr += STAMP_DWORDS; } } static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, u32 index) { struct mlx4_en_tx_info *tx_info; struct mbuf *mb; tx_info = &ring->tx_info[index]; mb = 
tx_info->mb; if (mb == NULL) goto done; bus_dmamap_sync(ring->dma_tag, tx_info->dma_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(ring->dma_tag, tx_info->dma_map); m_freem(mb); done: return (tx_info->nr_txbb); } int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) { struct mlx4_en_priv *priv = netdev_priv(dev); int cnt = 0; /* Skip last polled descriptor */ ring->cons += ring->last_nr_txbb; en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n", ring->cons, ring->prod); if ((u32) (ring->prod - ring->cons) > ring->size) { en_warn(priv, "Tx consumer passed producer!\n"); return 0; } while (ring->cons != ring->prod) { ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring, ring->cons & ring->size_mask); ring->cons += ring->last_nr_txbb; cnt++; } if (cnt) en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt); return cnt; } static bool mlx4_en_tx_ring_is_full(struct mlx4_en_tx_ring *ring) { int wqs; wqs = ring->size - (ring->prod - ring->cons); return (wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS))); } static int mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cq *mcq = &cq->mcq; struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; struct mlx4_cqe *cqe; u16 index; u16 new_index, ring_index, stamp_index; u32 txbbs_skipped = 0; u32 txbbs_stamp = 0; u32 cons_index = mcq->cons_index; int size = cq->size; u32 size_mask = ring->size_mask; struct mlx4_cqe *buf = cq->buf; int factor = priv->cqe_factor; if (!priv->port_up) return 0; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; ring_index = ring->cons & size_mask; stamp_index = ring_index; /* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, cons_index & size)) { /* * make sure we read the CQE after we read the * ownership bit */ rmb(); if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR)) { en_err(priv, "CQE completed in error - vendor syndrom: 0x%x syndrom: 0x%x\n", ((struct mlx4_err_cqe *)cqe)-> vendor_err_syndrome, ((struct mlx4_err_cqe *)cqe)->syndrome); } /* Skip over last polled CQE */ new_index = be16_to_cpu(cqe->wqe_index) & size_mask; do { txbbs_skipped += ring->last_nr_txbb; ring_index = (ring_index + ring->last_nr_txbb) & size_mask; /* free next descriptor */ ring->last_nr_txbb = mlx4_en_free_tx_desc( priv, ring, ring_index); mlx4_en_stamp_wqe(priv, ring, stamp_index, !!((ring->cons + txbbs_stamp) & ring->size)); stamp_index = ring_index; txbbs_stamp = txbbs_skipped; } while (ring_index != new_index); ++cons_index; index = cons_index & size_mask; cqe = &buf[(index << factor) + factor]; } /* * To prevent CQ overflow we first update CQ consumer and only then * the ring consumer. 
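 * NOTE (a minimal ordering sketch, based on the statements just below):
 * mcq->cons_index is published through mlx4_cq_set_ci() and a wmb()
 * before ring->cons is advanced, so the hardware always observes the
 * freed CQ entries before the driver can recycle the corresponding TX
 * ring slots.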
*/ mcq->cons_index = cons_index; mlx4_cq_set_ci(mcq); wmb(); ring->cons += txbbs_skipped; /* Wakeup Tx queue if it was stopped and ring is not full */ if (unlikely(ring->blocked) && !mlx4_en_tx_ring_is_full(ring)) { ring->blocked = 0; if (atomic_fetchadd_int(&priv->blocked, -1) == 1) atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE); ring->wake_queue++; priv->port_stats.wake_queue++; } return (0); } void mlx4_en_tx_irq(struct mlx4_cq *mcq) { struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; if (priv->port_up == 0 || !spin_trylock(&ring->comp_lock)) return; mlx4_en_process_tx_cq(cq->dev, cq); mod_timer(&cq->timer, jiffies + 1); spin_unlock(&ring->comp_lock); } void mlx4_en_poll_tx_cq(unsigned long data) { struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data; struct mlx4_en_priv *priv = netdev_priv(cq->dev); struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; u32 inflight; INC_PERF_COUNTER(priv->pstats.tx_poll); if (priv->port_up == 0) return; if (!spin_trylock(&ring->comp_lock)) { mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); return; } mlx4_en_process_tx_cq(cq->dev, cq); inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb); /* If there are still packets in flight and the timer has not already * been scheduled by the Tx routine then schedule it here to guarantee * completion processing of these packets */ if (inflight && priv->port_up) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); spin_unlock(&ring->comp_lock); } static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind) { struct mlx4_en_cq *cq = priv->tx_cq[tx_ind]; struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; if (priv->port_up == 0) return; /* If we don't have a pending timer, set one up to catch our recent post in case the interface becomes idle */ if (!timer_pending(&cq->timer)) mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT); /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0) if (spin_trylock(&ring->comp_lock)) { mlx4_en_process_tx_cq(priv->dev, cq); spin_unlock(&ring->comp_lock); } } static u16 mlx4_en_get_inline_hdr_size(struct mlx4_en_tx_ring *ring, struct mbuf *mb) { u16 retval; /* only copy from first fragment, if possible */ retval = MIN(ring->inline_thold, mb->m_len); /* check for too little data */ if (unlikely(retval < MIN_PKT_LEN)) retval = MIN(ring->inline_thold, mb->m_pkthdr.len); return (retval); } static int mlx4_en_get_header_size(struct mbuf *mb) { struct ether_vlan_header *eh; struct tcphdr *th; struct ip *ip; int ip_hlen, tcp_hlen; struct ip6_hdr *ip6; uint16_t eth_type; int eth_hdr_len; eh = mtod(mb, struct ether_vlan_header *); if (mb->m_len < ETHER_HDR_LEN) return (0); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { eth_type = ntohs(eh->evl_proto); eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); eth_hdr_len = ETHER_HDR_LEN; } if (mb->m_len < eth_hdr_len) return (0); switch (eth_type) { case ETHERTYPE_IP: ip = (struct ip *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip)) return (0); if (ip->ip_p != IPPROTO_TCP) return (0); ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen; break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip6)) return (0); if (ip6->ip6_nxt != IPPROTO_TCP) return (0); eth_hdr_len += sizeof(*ip6); break; 
default: return (0); } if (mb->m_len < eth_hdr_len + sizeof(*th)) return (0); th = (struct tcphdr *)(mb->m_data + eth_hdr_len); tcp_hlen = th->th_off << 2; eth_hdr_len += tcp_hlen; if (mb->m_len < eth_hdr_len) return (0); return (eth_hdr_len); } static volatile struct mlx4_wqe_data_seg * mlx4_en_store_inline_data(volatile struct mlx4_wqe_data_seg *dseg, struct mbuf *mb, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4; if (unlikely(len < MIN_PKT_LEN)) { m_copydata(mb, 0, len, inl + 4); memset(inl + 4 + len, 0, MIN_PKT_LEN - len); dseg += DIV_ROUND_UP(4 + MIN_PKT_LEN, DS_SIZE_ALIGNMENT); } else if (len <= spc) { m_copydata(mb, 0, len, inl + 4); dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT); } else { m_copydata(mb, 0, spc, inl + 4); m_copydata(mb, spc, len - spc, inl + 8 + spc); dseg += DIV_ROUND_UP(8 + len, DS_SIZE_ALIGNMENT); } return (dseg); } static void mlx4_en_store_inline_header(volatile struct mlx4_wqe_data_seg *dseg, int len, __be32 owner_bit) { uint8_t *inl = __DEVOLATILE(uint8_t *, dseg); const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4; if (unlikely(len < MIN_PKT_LEN)) { *(volatile uint32_t *)inl = SET_BYTE_COUNT((1 << 31) | MIN_PKT_LEN); } else if (len <= spc) { *(volatile uint32_t *)inl = SET_BYTE_COUNT((1 << 31) | len); } else { *(volatile uint32_t *)(inl + 4 + spc) = SET_BYTE_COUNT((1 << 31) | (len - spc)); wmb(); *(volatile uint32_t *)inl = SET_BYTE_COUNT((1 << 31) | spc); } } static uint32_t hashrandom; static void hashrandom_init(void *arg) { /* * It is assumed that the random subsystem has been * initialized when this function is called: */ hashrandom = m_ether_tcpip_hash_init(); } SYSINIT(hashrandom_init, SI_SUB_RANDOM, SI_ORDER_ANY, &hashrandom_init, NULL); u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb) { struct mlx4_en_priv *priv = netdev_priv(dev); u32 rings_p_up = priv->num_tx_rings_p_up; u32 up = 0; u32 queue_index; #if (MLX4_EN_NUM_UP > 1) /* Obtain VLAN information if present */ if (mb->m_flags & M_VLANTAG) { u32 vlan_tag = mb->m_pkthdr.ether_vtag; up = (vlan_tag >> 13) % MLX4_EN_NUM_UP; } #endif queue_index = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 | MBUF_HASHFLAG_L4, mb, hashrandom); return ((queue_index % rings_p_up) + (up * rings_p_up)); } static void mlx4_bf_copy(void __iomem *dst, volatile unsigned long *src, unsigned bytecnt) { __iowrite64_copy(dst, __DEVOLATILE(void *, src), bytecnt / 8); } static u64 mlx4_en_mac_to_u64(u8 *addr) { u64 mac = 0; int i; for (i = 0; i < ETHER_ADDR_LEN; i++) { mac <<= 8; mac |= addr[i]; } return mac; } static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp) { enum { DS_FACT = TXBB_SIZE / DS_SIZE_ALIGNMENT, CTRL_FLAGS = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICITED), }; bus_dma_segment_t segs[MLX4_EN_TX_MAX_MBUF_FRAGS]; volatile struct mlx4_wqe_data_seg *dseg; volatile struct mlx4_wqe_data_seg *dseg_inline; volatile struct mlx4_en_tx_desc *tx_desc; struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind]; struct ifnet *ifp = priv->dev; struct mlx4_en_tx_info *tx_info; struct mbuf *mb = *mbp; struct mbuf *m; __be32 owner_bit; int nr_segs; int pad; int err; u32 bf_size; u32 bf_prod; u32 opcode; u16 index; u16 ds_cnt; u16 ihs; if (unlikely(!priv->port_up)) { err = EINVAL; goto tx_drop; } /* check if TX ring is full */ if (unlikely(mlx4_en_tx_ring_is_full(ring))) { - /* every full native Tx ring stops queue */ - if (ring->blocked == 0) - atomic_add_int(&priv->blocked, 1); - /* Set HW-queue-is-full 
flag */ - atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); - priv->port_stats.queue_stopped++; - ring->blocked = 1; + /* every full native Tx ring stops queue */ + if (ring->blocked == 0) + atomic_add_int(&priv->blocked, 1); + /* Set HW-queue-is-full flag */ + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); priv->port_stats.queue_stopped++; + ring->blocked = 1; ring->queue_stopped++; /* Use interrupts to find out when queue opened */ mlx4_en_arm_cq(priv, priv->tx_cq[tx_ind]); return (ENOBUFS); - } + } /* sanity check we are not wrapping around */ KASSERT(((~ring->prod) & ring->size_mask) >= (MLX4_EN_TX_WQE_MAX_WQEBBS - 1), ("Wrapping around TX ring")); /* Track current inflight packets for performance analysis */ AVG_PERF_COUNTER(priv->pstats.inflight_avg, (u32) (ring->prod - ring->cons - 1)); /* Track current mbuf packet header length */ AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len); /* Grab an index and try to transmit packet */ owner_bit = (ring->prod & ring->size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0; index = ring->prod & ring->size_mask; tx_desc = (volatile struct mlx4_en_tx_desc *) (ring->buf + index * TXBB_SIZE); tx_info = &ring->tx_info[index]; dseg = &tx_desc->data; /* send a copy of the frame to the BPF listener, if any */ if (ifp != NULL && ifp->if_bpf != NULL) ETHER_BPF_MTAP(ifp, mb); /* get default flags */ tx_desc->ctrl.srcrb_flags = CTRL_FLAGS; if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM); if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM); /* do statistics */ if (likely(tx_desc->ctrl.srcrb_flags != CTRL_FLAGS)) { priv->port_stats.tx_chksum_offload++; ring->tx_csum++; } /* check for VLAN tag */ if (mb->m_flags & M_VLANTAG) { tx_desc->ctrl.vlan_tag = cpu_to_be16(mb->m_pkthdr.ether_vtag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN; } else { tx_desc->ctrl.vlan_tag = 0; tx_desc->ctrl.ins_vlan = 0; } /* clear immediate field */ tx_desc->ctrl.imm = 0; /* Handle LSO (TSO) packets */ if (mb->m_pkthdr.csum_flags & CSUM_TSO) { u32 payload_len; u32 mss = mb->m_pkthdr.tso_segsz; u32 num_pkts; opcode = cpu_to_be32(MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR) | owner_bit; ihs = mlx4_en_get_header_size(mb); if (unlikely(ihs > MAX_INLINE)) { ring->oversized_packets++; err = EINVAL; goto tx_drop; } tx_desc->lso.mss_hdr_size = cpu_to_be32((mss << 16) | ihs); payload_len = mb->m_pkthdr.len - ihs; if (unlikely(payload_len == 0)) num_pkts = 1; else num_pkts = DIV_ROUND_UP(payload_len, mss); ring->bytes += payload_len + (num_pkts * ihs); ring->packets += num_pkts; priv->port_stats.tso_packets++; /* store pointer to inline header */ dseg_inline = dseg; /* copy data inline */ dseg = mlx4_en_store_inline_lso_data(dseg, mb, ihs, owner_bit); } else { opcode = cpu_to_be32(MLX4_OPCODE_SEND) | owner_bit; ihs = mlx4_en_get_inline_hdr_size(ring, mb); ring->bytes += max_t (unsigned int, mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN); ring->packets++; /* store pointer to inline header */ dseg_inline = dseg; /* copy data inline */ dseg = mlx4_en_store_inline_data(dseg, mb, ihs, owner_bit); } m_adj(mb, ihs); /* trim off empty mbufs */ while (mb->m_len == 0) { mb = m_free(mb); /* check if all data has been inlined */ if (mb == NULL) { nr_segs = 0; goto skip_dma; } } err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map, mb, segs, &nr_segs, BUS_DMA_NOWAIT); if (unlikely(err == EFBIG)) { /* Too 
many mbuf fragments */ m = m_defrag(mb, M_NOWAIT); if (m == NULL) { ring->oversized_packets++; goto tx_drop; } mb = m; /* Try again */ err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map, mb, segs, &nr_segs, BUS_DMA_NOWAIT); } /* catch errors */ if (unlikely(err != 0)) { ring->oversized_packets++; goto tx_drop; } /* make sure all mbuf data is written to RAM */ bus_dmamap_sync(ring->dma_tag, tx_info->dma_map, BUS_DMASYNC_PREWRITE); skip_dma: /* compute number of DS needed */ ds_cnt = (dseg - ((volatile struct mlx4_wqe_data_seg *)tx_desc)) + nr_segs; /* * Check if the next request can wrap around and fill the end * of the current request with zero immediate data: */ pad = DIV_ROUND_UP(ds_cnt, DS_FACT); pad = (~(ring->prod + pad)) & ring->size_mask; if (unlikely(pad < (MLX4_EN_TX_WQE_MAX_WQEBBS - 1))) { /* * Compute the least number of DS blocks we need to * pad in order to achieve a TX ring wraparound: */ pad = (DS_FACT * (pad + 1)); } else { /* * The hardware will automatically jump to the next * TXBB. No need for padding. */ pad = 0; } /* compute total number of DS blocks */ ds_cnt += pad; /* * When modifying this code, please ensure that the following * computation is always less than or equal to 0x3F: * * ((MLX4_EN_TX_WQE_MAX_WQEBBS - 1) * DS_FACT) + * (MLX4_EN_TX_WQE_MAX_WQEBBS * DS_FACT) * * Else the "ds_cnt" variable can become too big. */ tx_desc->ctrl.fence_size = (ds_cnt & 0x3f); /* store pointer to mbuf */ tx_info->mb = mb; tx_info->nr_txbb = DIV_ROUND_UP(ds_cnt, DS_FACT); bf_size = ds_cnt * DS_SIZE_ALIGNMENT; bf_prod = ring->prod; /* compute end of "dseg" array */ dseg += nr_segs + pad; /* pad using zero immediate dseg */ while (pad--) { dseg--; dseg->addr = 0; dseg->lkey = 0; wmb(); dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0); } /* fill segment list */ while (nr_segs--) { if (unlikely(segs[nr_segs].ds_len == 0)) { dseg--; dseg->addr = 0; dseg->lkey = 0; wmb(); dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0); } else { dseg--; dseg->addr = cpu_to_be64((uint64_t)segs[nr_segs].ds_addr); dseg->lkey = cpu_to_be32(priv->mdev->mr.key); wmb(); dseg->byte_count = SET_BYTE_COUNT((uint32_t)segs[nr_segs].ds_len); } } wmb(); /* write owner bits in reverse order */ if ((opcode & cpu_to_be32(0x1F)) == cpu_to_be32(MLX4_OPCODE_LSO)) mlx4_en_store_inline_lso_header(dseg_inline, ihs, owner_bit); else mlx4_en_store_inline_header(dseg_inline, ihs, owner_bit); if (unlikely(priv->validate_loopback)) { /* Copy dst mac address to wqe */ struct ether_header *ethh; u64 mac; u32 mac_l, mac_h; ethh = mtod(mb, struct ether_header *); mac = mlx4_en_mac_to_u64(ethh->ether_dhost); if (mac) { mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16); mac_l = (u32) (mac & 0xffffffff); tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h); tx_desc->ctrl.imm = cpu_to_be32(mac_l); } } /* update producer counter */ ring->prod += tx_info->nr_txbb; if (ring->bf_enabled && bf_size <= MAX_BF && (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_VLAN)) { /* store doorbell number */ *(volatile __be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn); /* or in producer number for this WQE */ opcode |= cpu_to_be32((bf_prod & 0xffff) << 8); /* * Ensure the new descriptor hits memory before * setting ownership of this descriptor to HW: */ wmb(); tx_desc->ctrl.owner_opcode = opcode; wmb(); mlx4_bf_copy(((u8 *)ring->bf.reg) + ring->bf.offset, (volatile unsigned long *) &tx_desc->ctrl, bf_size); wmb(); ring->bf.offset ^= ring->bf.buf_size; } else { /* * Ensure the new descriptor hits memory before * setting ownership of this 
descriptor to HW: */ wmb(); tx_desc->ctrl.owner_opcode = opcode; wmb(); writel(cpu_to_be32(ring->doorbell_qpn), ((u8 *)ring->bf.uar->map) + MLX4_SEND_DOORBELL); } return (0); tx_drop: *mbp = NULL; m_freem(mb); return (err); } static int mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mbuf *next; int enqueued, err = 0; ring = priv->tx_ring[tx_ind]; if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || priv->port_up == 0) { if (m != NULL) err = drbr_enqueue(dev, ring->br, m); return (err); } enqueued = 0; if (m != NULL) /* * If we can't insert mbuf into drbr, try to xmit anyway. * We keep the error we got so we could return that after xmit. */ err = drbr_enqueue(dev, ring->br, m); /* Process the queue */ while ((next = drbr_peek(dev, ring->br)) != NULL) { if (mlx4_en_xmit(priv, tx_ind, &next) != 0) { if (next == NULL) { drbr_advance(dev, ring->br); } else { drbr_putback(dev, ring->br, next); } break; } drbr_advance(dev, ring->br); enqueued++; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (enqueued > 0) ring->watchdog_time = ticks; return (err); } void mlx4_en_tx_que(void *context, int pending) { struct mlx4_en_tx_ring *ring; struct mlx4_en_priv *priv; struct net_device *dev; struct mlx4_en_cq *cq; int tx_ind; cq = context; dev = cq->dev; priv = dev->if_softc; tx_ind = cq->ring; ring = priv->tx_ring[tx_ind]; if (priv->port_up != 0 && (dev->if_drv_flags & IFF_DRV_RUNNING) != 0) { mlx4_en_xmit_poll(priv, tx_ind); spin_lock(&ring->tx_lock); if (!drbr_empty(dev, ring->br)) mlx4_en_transmit_locked(dev, tx_ind, NULL); spin_unlock(&ring->tx_lock); } } int mlx4_en_transmit(struct ifnet *dev, struct mbuf *m) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mlx4_en_cq *cq; int i, err = 0; if (priv->port_up == 0) { m_freem(m); return (ENETDOWN); } /* Compute which queue to use */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { i = (m->m_pkthdr.flowid % 128) % priv->tx_ring_num; } else { i = mlx4_en_select_queue(dev, m); } ring = priv->tx_ring[i]; if (spin_trylock(&ring->tx_lock)) { err = mlx4_en_transmit_locked(dev, i, m); spin_unlock(&ring->tx_lock); /* Poll CQ here */ mlx4_en_xmit_poll(priv, i); } else { err = drbr_enqueue(dev, ring->br, m); cq = priv->tx_cq[i]; taskqueue_enqueue(cq->tq, &cq->cq_task); } return (err); } /* * Flush ring buffers. */ void mlx4_en_qflush(struct ifnet *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_tx_ring *ring; struct mbuf *m; if (priv->port_up == 0) return; for (int i = 0; i < priv->tx_ring_num; i++) { ring = priv->tx_ring[i]; spin_lock(&ring->tx_lock); while ((m = buf_ring_dequeue_sc(ring->br)) != NULL) m_freem(m); spin_unlock(&ring->tx_lock); } if_qflush(dev); } Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/if_em_netmap.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/if_em_netmap.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/if_em_netmap.h (revision 308054) @@ -1,329 +1,329 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * netmap support for: em. * * For more details on netmap support please see ixgbe_netmap.h */ #include #include #include #include /* vtophys ? */ #include // XXX do we need to block/unblock the tasks ? static void em_netmap_block_tasks(struct adapter *adapter) { if (adapter->msix > 1) { /* MSIX */ int i; struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) { taskqueue_block(txr->tq); taskqueue_drain(txr->tq, &txr->tx_task); taskqueue_block(rxr->tq); taskqueue_drain(rxr->tq, &rxr->rx_task); } } else { /* legacy */ taskqueue_block(adapter->tq); taskqueue_drain(adapter->tq, &adapter->link_task); taskqueue_drain(adapter->tq, &adapter->que_task); } } static void em_netmap_unblock_tasks(struct adapter *adapter) { if (adapter->msix > 1) { struct tx_ring *txr = adapter->tx_rings; struct rx_ring *rxr = adapter->rx_rings; int i; for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) { taskqueue_unblock(txr->tq); taskqueue_unblock(rxr->tq); } } else { /* legacy */ taskqueue_unblock(adapter->tq); } } /* * Register/unregister. We are already under netmap lock. */ static int em_netmap_reg(struct netmap_adapter *na, int onoff) { struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; EM_CORE_LOCK(adapter); em_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); em_netmap_block_tasks(adapter); /* enable or disable flags and callbacks in na and ifp */ if (onoff) { nm_set_native_flags(na); } else { nm_clear_native_flags(na); } em_init_locked(adapter); /* also enable intr */ em_netmap_unblock_tasks(adapter); EM_CORE_UNLOCK(adapter); return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. */ static int em_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ u_int n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = &adapter->tx_rings[kring->ring_id]; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* * First part: process new packets to send. 
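 * NOTE (summary of the loop below, phrased as a sketch): slots from
 * nr_hwcur up to rhead are walked, each netmap index is translated to a
 * NIC index via netmap_idx_k2n()/nm_next(), the DMA map is reloaded only
 * when NS_BUF_CHANGED is set, and the TX unit is finally kicked by
 * writing TDT.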
*/ nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; void *addr = PNMB(na, slot, &paddr); /* device-specific */ struct e1000_tx_desc *curr = &txr->tx_base[nic_i]; struct em_txbuffer *txbuf = &txr->tx_buffers[nic_i]; int flags = (slot->flags & NS_REPORT || nic_i == 0 || nic_i == report_frequency) ? E1000_TXD_CMD_RS : 0; NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ netmap_reload_map(na, txr->txtag, txbuf->map, addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); /* Fill the slot in the NIC ring. */ curr->upper.data = 0; curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* (re)start the tx unit up to slot nic_i (excluded) */ E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } /* * Second part: reclaim buffers for completed transmissions. */ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; } if (nic_i != txr->next_to_clean) { txr->next_to_clean = nic_i; kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } return 0; } /* * Reconcile kernel and user view of the receive ring. */ static int em_netmap_rxsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ u_int n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id]; if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* * First part: import newly received packets. */ if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; nic_i = rxr->next_to_check; nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { // XXX no need to count union e1000_rx_desc_extended *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; ring->slot[nm_i].len = le16toh(curr->wb.upper.length); ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[nic_i].map, BUS_DMASYNC_POSTREAD); nm_i = nm_next(nm_i, lim); /* make sure next_to_refresh follows next_to_check */ rxr->next_to_refresh = nic_i; // XXX nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ rxr->next_to_check = nic_i; kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } /* * Second part: skip past packets that userspace has released. 
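 * NOTE (summary of the loop below, phrased as a sketch): each released
 * slot has its read.buffer_addr rewritten, the DMA map is reloaded only
 * on NS_BUF_CHANGED, the write-back status is cleared, and RDT is moved
 * to one slot before nic_i so the RX ring is never completely full.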
*/ nm_i = kring->nr_hwcur; if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(na, slot, &paddr); union e1000_rx_desc_extended *curr = &rxr->rx_base[nic_i]; struct em_rxbuffer *rxbuf = &rxr->rx_buffers[nic_i]; if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; + curr->read.buffer_addr = htole64(paddr); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - curr->read.buffer_addr = htole64(paddr); netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->wb.upper.status_error = 0; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } kring->nr_hwcur = head; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ nic_i = nm_prev(nic_i, lim); E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } return 0; ring_reset: return netmap_ring_reinit(kring); } static void em_netmap_attach(struct adapter *adapter) { struct netmap_adapter na; bzero(&na, sizeof(na)); na.ifp = adapter->ifp; na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = em_netmap_txsync; na.nm_rxsync = em_netmap_rxsync; na.nm_register = em_netmap_reg; na.num_tx_rings = na.num_rx_rings = adapter->num_queues; netmap_attach(&na); } /* end of file */ Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/if_ptnet.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/if_ptnet.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/if_ptnet.c (revision 308054) @@ -1,2283 +1,2276 @@ /*- * Copyright (c) 2016, Vincenzo Maffione * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* Driver for ptnet paravirtualized network device. 
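 *
 * The device exposes the krings of a host netmap port through a shared
 * Communication Status Block (CSB) and a set of I/O registers
 * (PTNET_IO_*): ring state is exchanged via the CSB, notifications to
 * the host are register writes ("kicks", see ptnet_kick()), and
 * notifications from the host arrive as MSI-X interrupts.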
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #ifndef PTNET_CSB_ALLOC #error "No support for on-device CSB" #endif #ifndef INET #error "INET not defined, cannot support offloadings" #endif #if __FreeBSD_version >= 1100000 static uint64_t ptnet_get_counter(if_t, ift_counter); #else typedef struct ifnet *if_t; #define if_getsoftc(_ifp) (_ifp)->if_softc #endif //#define PTNETMAP_STATS //#define DEBUG #ifdef DEBUG #define DBG(x) x #else /* !DEBUG */ #define DBG(x) #endif /* !DEBUG */ extern int ptnet_vnet_hdr; /* Tunable parameter */ struct ptnet_softc; struct ptnet_queue_stats { uint64_t packets; /* if_[io]packets */ uint64_t bytes; /* if_[io]bytes */ uint64_t errors; /* if_[io]errors */ uint64_t iqdrops; /* if_iqdrops */ uint64_t mcasts; /* if_[io]mcasts */ #ifdef PTNETMAP_STATS uint64_t intrs; uint64_t kicks; #endif /* PTNETMAP_STATS */ }; struct ptnet_queue { struct ptnet_softc *sc; struct resource *irq; void *cookie; int kring_id; struct ptnet_ring *ptring; unsigned int kick; struct mtx lock; struct buf_ring *bufring; /* for TX queues */ struct ptnet_queue_stats stats; #ifdef PTNETMAP_STATS struct ptnet_queue_stats last_stats; #endif /* PTNETMAP_STATS */ struct taskqueue *taskq; struct task task; char lock_name[16]; }; #define PTNET_Q_LOCK(_pq) mtx_lock(&(_pq)->lock) #define PTNET_Q_TRYLOCK(_pq) mtx_trylock(&(_pq)->lock) #define PTNET_Q_UNLOCK(_pq) mtx_unlock(&(_pq)->lock) struct ptnet_softc { device_t dev; if_t ifp; struct ifmedia media; struct mtx lock; char lock_name[16]; char hwaddr[ETHER_ADDR_LEN]; /* Mirror of PTFEAT register. */ uint32_t ptfeatures; unsigned int vnet_hdr_len; /* PCI BARs support. 
*/ struct resource *iomem; struct resource *msix_mem; unsigned int num_rings; unsigned int num_tx_rings; struct ptnet_queue *queues; struct ptnet_queue *rxqueues; struct ptnet_csb *csb; unsigned int min_tx_space; struct netmap_pt_guest_adapter *ptna; struct callout tick; #ifdef PTNETMAP_STATS struct timeval last_ts; #endif /* PTNETMAP_STATS */ }; #define PTNET_CORE_LOCK(_sc) mtx_lock(&(_sc)->lock) #define PTNET_CORE_UNLOCK(_sc) mtx_unlock(&(_sc)->lock) static int ptnet_probe(device_t); static int ptnet_attach(device_t); static int ptnet_detach(device_t); static int ptnet_suspend(device_t); static int ptnet_resume(device_t); static int ptnet_shutdown(device_t); static void ptnet_init(void *opaque); static int ptnet_ioctl(if_t ifp, u_long cmd, caddr_t data); static int ptnet_init_locked(struct ptnet_softc *sc); static int ptnet_stop(struct ptnet_softc *sc); static int ptnet_transmit(if_t ifp, struct mbuf *m); static int ptnet_drain_transmit_queue(struct ptnet_queue *pq, unsigned int budget, bool may_resched); static void ptnet_qflush(if_t ifp); static void ptnet_tx_task(void *context, int pending); static int ptnet_media_change(if_t ifp); static void ptnet_media_status(if_t ifp, struct ifmediareq *ifmr); #ifdef PTNETMAP_STATS static void ptnet_tick(void *opaque); #endif static int ptnet_irqs_init(struct ptnet_softc *sc); static void ptnet_irqs_fini(struct ptnet_softc *sc); static uint32_t ptnet_nm_ptctl(if_t ifp, uint32_t cmd); static int ptnet_nm_config(struct netmap_adapter *na, unsigned *txr, unsigned *txd, unsigned *rxr, unsigned *rxd); static void ptnet_update_vnet_hdr(struct ptnet_softc *sc); static int ptnet_nm_register(struct netmap_adapter *na, int onoff); static int ptnet_nm_txsync(struct netmap_kring *kring, int flags); static int ptnet_nm_rxsync(struct netmap_kring *kring, int flags); static void ptnet_tx_intr(void *opaque); static void ptnet_rx_intr(void *opaque); static unsigned ptnet_rx_discard(struct netmap_kring *kring, unsigned int head); static int ptnet_rx_eof(struct ptnet_queue *pq, unsigned int budget, bool may_resched); static void ptnet_rx_task(void *context, int pending); #ifdef DEVICE_POLLING static poll_handler_t ptnet_poll; #endif static device_method_t ptnet_methods[] = { DEVMETHOD(device_probe, ptnet_probe), DEVMETHOD(device_attach, ptnet_attach), DEVMETHOD(device_detach, ptnet_detach), DEVMETHOD(device_suspend, ptnet_suspend), DEVMETHOD(device_resume, ptnet_resume), DEVMETHOD(device_shutdown, ptnet_shutdown), DEVMETHOD_END }; static driver_t ptnet_driver = { "ptnet", ptnet_methods, sizeof(struct ptnet_softc) }; /* We use (SI_ORDER_MIDDLE+2) here, see DEV_MODULE_ORDERED() invocation. 
*/ static devclass_t ptnet_devclass; DRIVER_MODULE_ORDERED(ptnet, pci, ptnet_driver, ptnet_devclass, NULL, NULL, SI_ORDER_MIDDLE + 2); static int ptnet_probe(device_t dev) { if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID || pci_get_device(dev) != PTNETMAP_PCI_NETIF_ID) { return (ENXIO); } device_set_desc(dev, "ptnet network adapter"); return (BUS_PROBE_DEFAULT); } static inline void ptnet_kick(struct ptnet_queue *pq) { #ifdef PTNETMAP_STATS pq->stats.kicks ++; #endif /* PTNETMAP_STATS */ bus_write_4(pq->sc->iomem, pq->kick, 0); } #define PTNET_BUF_RING_SIZE 4096 #define PTNET_RX_BUDGET 512 #define PTNET_RX_BATCH 1 #define PTNET_TX_BUDGET 512 #define PTNET_TX_BATCH 64 #define PTNET_HDR_SIZE sizeof(struct virtio_net_hdr_mrg_rxbuf) #define PTNET_MAX_PKT_SIZE 65536 #define PTNET_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP | CSUM_SCTP) #define PTNET_CSUM_OFFLOAD_IPV6 (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 |\ CSUM_SCTP_IPV6) #define PTNET_ALL_OFFLOAD (CSUM_TSO | PTNET_CSUM_OFFLOAD |\ PTNET_CSUM_OFFLOAD_IPV6) static int ptnet_attach(device_t dev) { - uint32_t ptfeatures = PTNETMAP_F_BASE; + uint32_t ptfeatures = 0; unsigned int num_rx_rings, num_tx_rings; struct netmap_adapter na_arg; unsigned int nifp_offset; struct ptnet_softc *sc; if_t ifp; uint32_t macreg; int err, rid; int i; sc = device_get_softc(dev); sc->dev = dev; /* Setup PCI resources. */ pci_enable_busmaster(dev); rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); sc->iomem = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, RF_ACTIVE); if (sc->iomem == NULL) { device_printf(dev, "Failed to map I/O BAR\n"); return (ENXIO); } - /* Check if we are supported by the hypervisor. If not, - * bail out immediately. */ + /* Negotiate features with the hypervisor. */ if (ptnet_vnet_hdr) { ptfeatures |= PTNETMAP_F_VNET_HDR; } bus_write_4(sc->iomem, PTNET_IO_PTFEAT, ptfeatures); /* wanted */ ptfeatures = bus_read_4(sc->iomem, PTNET_IO_PTFEAT); /* acked */ - if (!(ptfeatures & PTNETMAP_F_BASE)) { - device_printf(dev, "Hypervisor does not support netmap " - "passthorugh\n"); - err = ENXIO; - goto err_path; - } sc->ptfeatures = ptfeatures; /* Allocate CSB and carry out CSB allocation protocol (CSBBAH first, * then CSBBAL). */ sc->csb = malloc(sizeof(struct ptnet_csb), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->csb == NULL) { device_printf(dev, "Failed to allocate CSB\n"); err = ENOMEM; goto err_path; } { /* * We use uint64_t rather than vm_paddr_t since we * need 64 bit addresses even on 32 bit platforms. */ uint64_t paddr = vtophys(sc->csb); bus_write_4(sc->iomem, PTNET_IO_CSBBAH, (paddr >> 32) & 0xffffffff); bus_write_4(sc->iomem, PTNET_IO_CSBBAL, paddr & 0xffffffff); } num_tx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_RINGS); num_rx_rings = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_RINGS); sc->num_rings = num_tx_rings + num_rx_rings; sc->num_tx_rings = num_tx_rings; /* Allocate and initialize per-queue data structures. */ sc->queues = malloc(sizeof(struct ptnet_queue) * sc->num_rings, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->queues == NULL) { err = ENOMEM; goto err_path; } sc->rxqueues = sc->queues + num_tx_rings; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; pq->sc = sc; pq->kring_id = i; pq->kick = PTNET_IO_KICK_BASE + 4 * i; pq->ptring = sc->csb->rings + i; snprintf(pq->lock_name, sizeof(pq->lock_name), "%s-%d", device_get_nameunit(dev), i); mtx_init(&pq->lock, pq->lock_name, NULL, MTX_DEF); if (i >= num_tx_rings) { /* RX queue: fix kring_id. */ pq->kring_id -= num_tx_rings; } else { /* TX queue: allocate buf_ring. 
*/ pq->bufring = buf_ring_alloc(PTNET_BUF_RING_SIZE, M_DEVBUF, M_NOWAIT, &pq->lock); if (pq->bufring == NULL) { err = ENOMEM; goto err_path; } } } sc->min_tx_space = 64; /* Safe initial value. */ err = ptnet_irqs_init(sc); if (err) { goto err_path; } /* Setup Ethernet interface. */ sc->ifp = ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "Failed to allocate ifnet\n"); err = ENOMEM; goto err_path; } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_baudrate = IF_Gbps(10); ifp->if_softc = sc; ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX; ifp->if_init = ptnet_init; ifp->if_ioctl = ptnet_ioctl; #if __FreeBSD_version >= 1100000 ifp->if_get_counter = ptnet_get_counter; #endif ifp->if_transmit = ptnet_transmit; ifp->if_qflush = ptnet_qflush; ifmedia_init(&sc->media, IFM_IMASK, ptnet_media_change, ptnet_media_status); ifmedia_add(&sc->media, IFM_ETHER | IFM_10G_T | IFM_FDX, 0, NULL); ifmedia_set(&sc->media, IFM_ETHER | IFM_10G_T | IFM_FDX); macreg = bus_read_4(sc->iomem, PTNET_IO_MAC_HI); sc->hwaddr[0] = (macreg >> 8) & 0xff; sc->hwaddr[1] = macreg & 0xff; macreg = bus_read_4(sc->iomem, PTNET_IO_MAC_LO); sc->hwaddr[2] = (macreg >> 24) & 0xff; sc->hwaddr[3] = (macreg >> 16) & 0xff; sc->hwaddr[4] = (macreg >> 8) & 0xff; sc->hwaddr[5] = macreg & 0xff; ether_ifattach(ifp, sc->hwaddr); ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; if (sc->ptfeatures & PTNETMAP_F_VNET_HDR) { /* Similarly to what the vtnet driver does, we can emulate * VLAN offloadings by inserting and removing the 802.1Q * header during transmit and receive. We are then able * to do checksum offloading of VLAN frames. */ ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 | IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_LRO | IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWTAGGING; } ifp->if_capenable = ifp->if_capabilities; #ifdef DEVICE_POLLING /* Don't enable polling by default. */ ifp->if_capabilities |= IFCAP_POLLING; #endif snprintf(sc->lock_name, sizeof(sc->lock_name), "%s", device_get_nameunit(dev)); mtx_init(&sc->lock, sc->lock_name, "ptnet core lock", MTX_DEF); callout_init_mtx(&sc->tick, &sc->lock, 0); /* Prepare a netmap_adapter struct instance to do netmap_attach(). */ nifp_offset = bus_read_4(sc->iomem, PTNET_IO_NIFP_OFS); memset(&na_arg, 0, sizeof(na_arg)); na_arg.ifp = ifp; na_arg.num_tx_desc = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_SLOTS); na_arg.num_rx_desc = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_SLOTS); na_arg.num_tx_rings = num_tx_rings; na_arg.num_rx_rings = num_rx_rings; na_arg.nm_config = ptnet_nm_config; na_arg.nm_krings_create = ptnet_nm_krings_create; na_arg.nm_krings_delete = ptnet_nm_krings_delete; na_arg.nm_dtor = ptnet_nm_dtor; na_arg.nm_register = ptnet_nm_register; na_arg.nm_txsync = ptnet_nm_txsync; na_arg.nm_rxsync = ptnet_nm_rxsync; - netmap_pt_guest_attach(&na_arg, sc->csb, nifp_offset, ptnet_nm_ptctl); + netmap_pt_guest_attach(&na_arg, sc->csb, nifp_offset, + bus_read_4(sc->iomem, PTNET_IO_HOSTMEMID)); /* Now a netmap adapter for this ifp has been allocated, and it * can be accessed through NA(ifp). We also have to initialize the CSB * pointer. */ sc->ptna = (struct netmap_pt_guest_adapter *)NA(ifp); /* If virtio-net header was negotiated, set the virt_hdr_len field in * the netmap adapter, to inform users that this netmap adapter requires * the application to deal with the headers. 
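 * The length chosen here is also written to the PTNET_IO_VNET_HDR_LEN
 * register, see ptnet_update_vnet_hdr().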
*/ ptnet_update_vnet_hdr(sc); device_printf(dev, "%s() completed\n", __func__); return (0); err_path: ptnet_detach(dev); return err; } static int ptnet_detach(device_t dev) { struct ptnet_softc *sc = device_get_softc(dev); int i; #ifdef DEVICE_POLLING if (sc->ifp->if_capenable & IFCAP_POLLING) { ether_poll_deregister(sc->ifp); } #endif callout_drain(&sc->tick); if (sc->queues) { /* Drain taskqueues before calling if_detach. */ for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (pq->taskq) { taskqueue_drain(pq->taskq, &pq->task); } } } if (sc->ifp) { ether_ifdetach(sc->ifp); /* Uninitialize netmap adapters for this device. */ netmap_detach(sc->ifp); ifmedia_removeall(&sc->media); if_free(sc->ifp); sc->ifp = NULL; } ptnet_irqs_fini(sc); if (sc->csb) { bus_write_4(sc->iomem, PTNET_IO_CSBBAH, 0); bus_write_4(sc->iomem, PTNET_IO_CSBBAL, 0); free(sc->csb, M_DEVBUF); sc->csb = NULL; } if (sc->queues) { for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (mtx_initialized(&pq->lock)) { mtx_destroy(&pq->lock); } if (pq->bufring != NULL) { buf_ring_free(pq->bufring, M_DEVBUF); } } free(sc->queues, M_DEVBUF); sc->queues = NULL; } if (sc->iomem) { bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(PTNETMAP_IO_PCI_BAR), sc->iomem); sc->iomem = NULL; } mtx_destroy(&sc->lock); device_printf(dev, "%s() completed\n", __func__); return (0); } static int ptnet_suspend(device_t dev) { struct ptnet_softc *sc; sc = device_get_softc(dev); (void)sc; return (0); } static int ptnet_resume(device_t dev) { struct ptnet_softc *sc; sc = device_get_softc(dev); (void)sc; return (0); } static int ptnet_shutdown(device_t dev) { /* * Suspend already does all of what we need to * do here; we just never expect to be resumed. */ return (ptnet_suspend(dev)); } static int ptnet_irqs_init(struct ptnet_softc *sc) { int rid = PCIR_BAR(PTNETMAP_MSIX_PCI_BAR); int nvecs = sc->num_rings; device_t dev = sc->dev; int err = ENOSPC; int cpu_cur; int i; if (pci_find_cap(dev, PCIY_MSIX, NULL) != 0) { device_printf(dev, "Could not find MSI-X capability\n"); return (ENXIO); } sc->msix_mem = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->msix_mem == NULL) { device_printf(dev, "Failed to allocate MSIX PCI BAR\n"); return (ENXIO); } if (pci_msix_count(dev) < nvecs) { device_printf(dev, "Not enough MSI-X vectors\n"); goto err_path; } err = pci_alloc_msix(dev, &nvecs); if (err) { device_printf(dev, "Failed to allocate MSI-X vectors\n"); goto err_path; } for (i = 0; i < nvecs; i++) { struct ptnet_queue *pq = sc->queues + i; rid = i + 1; pq->irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (pq->irq == NULL) { device_printf(dev, "Failed to allocate interrupt " "for queue #%d\n", i); err = ENOSPC; goto err_path; } } cpu_cur = CPU_FIRST(); for (i = 0; i < nvecs; i++) { struct ptnet_queue *pq = sc->queues + i; void (*handler)(void *) = ptnet_tx_intr; if (i >= sc->num_tx_rings) { handler = ptnet_rx_intr; } err = bus_setup_intr(dev, pq->irq, INTR_TYPE_NET | INTR_MPSAFE, NULL /* intr_filter */, handler, pq, &pq->cookie); if (err) { device_printf(dev, "Failed to register intr handler " "for queue #%d\n", i); goto err_path; } bus_describe_intr(dev, pq->irq, pq->cookie, "q%d", i); #if 0 bus_bind_intr(sc->dev, pq->irq, cpu_cur); #endif cpu_cur = CPU_NEXT(cpu_cur); } device_printf(dev, "Allocated %d MSI-X vectors\n", nvecs); cpu_cur = CPU_FIRST(); for (i = 0; i < nvecs; i++) { struct ptnet_queue *pq = sc->queues + i; static void (*handler)(void *context, int 
pending); handler = (i < sc->num_tx_rings) ? ptnet_tx_task : ptnet_rx_task; TASK_INIT(&pq->task, 0, handler, pq); pq->taskq = taskqueue_create_fast("ptnet_queue", M_NOWAIT, taskqueue_thread_enqueue, &pq->taskq); taskqueue_start_threads(&pq->taskq, 1, PI_NET, "%s-pq-%d", device_get_nameunit(sc->dev), cpu_cur); cpu_cur = CPU_NEXT(cpu_cur); } return 0; err_path: ptnet_irqs_fini(sc); return err; } static void ptnet_irqs_fini(struct ptnet_softc *sc) { device_t dev = sc->dev; int i; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (pq->taskq) { taskqueue_free(pq->taskq); pq->taskq = NULL; } if (pq->cookie) { bus_teardown_intr(dev, pq->irq, pq->cookie); pq->cookie = NULL; } if (pq->irq) { bus_release_resource(dev, SYS_RES_IRQ, i + 1, pq->irq); pq->irq = NULL; } } if (sc->msix_mem) { pci_release_msi(dev); bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MSIX_PCI_BAR), sc->msix_mem); sc->msix_mem = NULL; } } static void ptnet_init(void *opaque) { struct ptnet_softc *sc = opaque; PTNET_CORE_LOCK(sc); ptnet_init_locked(sc); PTNET_CORE_UNLOCK(sc); } static int ptnet_ioctl(if_t ifp, u_long cmd, caddr_t data) { struct ptnet_softc *sc = if_getsoftc(ifp); device_t dev = sc->dev; struct ifreq *ifr = (struct ifreq *)data; int mask, err = 0; switch (cmd) { case SIOCSIFFLAGS: device_printf(dev, "SIOCSIFFLAGS %x\n", ifp->if_flags); PTNET_CORE_LOCK(sc); if (ifp->if_flags & IFF_UP) { /* Network stack wants the iff to be up. */ err = ptnet_init_locked(sc); } else { /* Network stack wants the iff to be down. */ err = ptnet_stop(sc); } /* We don't need to do nothing to support IFF_PROMISC, * since that is managed by the backend port. */ PTNET_CORE_UNLOCK(sc); break; case SIOCSIFCAP: device_printf(dev, "SIOCSIFCAP %x %x\n", ifr->ifr_reqcap, ifp->if_capenable); mask = ifr->ifr_reqcap ^ ifp->if_capenable; #ifdef DEVICE_POLLING if (mask & IFCAP_POLLING) { struct ptnet_queue *pq; int i; if (ifr->ifr_reqcap & IFCAP_POLLING) { err = ether_poll_register(ptnet_poll, ifp); if (err) { break; } /* Stop queues and sync with taskqueues. */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; for (i = 0; i < sc->num_rings; i++) { pq = sc-> queues + i; /* Make sure the worker sees the * IFF_DRV_RUNNING down. */ PTNET_Q_LOCK(pq); pq->ptring->guest_need_kick = 0; PTNET_Q_UNLOCK(pq); /* Wait for rescheduling to finish. */ if (pq->taskq) { taskqueue_drain(pq->taskq, &pq->task); } } ifp->if_drv_flags |= IFF_DRV_RUNNING; } else { err = ether_poll_deregister(ifp); for (i = 0; i < sc->num_rings; i++) { pq = sc-> queues + i; PTNET_Q_LOCK(pq); pq->ptring->guest_need_kick = 1; PTNET_Q_UNLOCK(pq); } } } #endif /* DEVICE_POLLING */ ifp->if_capenable = ifr->ifr_reqcap; break; case SIOCSIFMTU: /* We support any reasonable MTU. */ if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > PTNET_MAX_PKT_SIZE) { err = EINVAL; } else { PTNET_CORE_LOCK(sc); ifp->if_mtu = ifr->ifr_mtu; PTNET_CORE_UNLOCK(sc); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: err = ifmedia_ioctl(ifp, ifr, &sc->media, cmd); break; default: err = ether_ioctl(ifp, cmd, data); break; } return err; } static int ptnet_init_locked(struct ptnet_softc *sc) { if_t ifp = sc->ifp; struct netmap_adapter *na_dr = &sc->ptna->dr.up; struct netmap_adapter *na_nm = &sc->ptna->hwup.up; unsigned int nm_buf_size; int ret; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { return 0; /* nothing to do */ } device_printf(sc->dev, "%s\n", __func__); /* Translate offload capabilities according to if_capenable. 
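 * The resulting if_hwassist flags determine which csum_flags the stack
 * sets in m_pkthdr, which ptnet_tx_offload() later translates into a
 * virtio-net header on transmit.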
*/ ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= PTNET_CSUM_OFFLOAD; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= PTNET_CSUM_OFFLOAD_IPV6; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; /* * Prepare the interface for netmap mode access. */ netmap_update_config(na_dr); ret = netmap_mem_finalize(na_dr->nm_mem, na_dr); if (ret) { device_printf(sc->dev, "netmap_mem_finalize() failed\n"); return ret; } if (sc->ptna->backend_regifs == 0) { ret = ptnet_nm_krings_create(na_nm); if (ret) { device_printf(sc->dev, "ptnet_nm_krings_create() " "failed\n"); goto err_mem_finalize; } ret = netmap_mem_rings_create(na_dr); if (ret) { device_printf(sc->dev, "netmap_mem_rings_create() " "failed\n"); goto err_rings_create; } ret = netmap_mem_get_lut(na_dr->nm_mem, &na_dr->na_lut); if (ret) { device_printf(sc->dev, "netmap_mem_get_lut() " "failed\n"); goto err_get_lut; } } ret = ptnet_nm_register(na_dr, 1 /* on */); if (ret) { goto err_register; } nm_buf_size = NETMAP_BUF_SIZE(na_dr); KASSERT(nm_buf_size > 0, ("Invalid netmap buffer size")); sc->min_tx_space = PTNET_MAX_PKT_SIZE / nm_buf_size + 2; device_printf(sc->dev, "%s: min_tx_space = %u\n", __func__, sc->min_tx_space); #ifdef PTNETMAP_STATS callout_reset(&sc->tick, hz, ptnet_tick, sc); #endif ifp->if_drv_flags |= IFF_DRV_RUNNING; return 0; err_register: memset(&na_dr->na_lut, 0, sizeof(na_dr->na_lut)); err_get_lut: netmap_mem_rings_delete(na_dr); err_rings_create: ptnet_nm_krings_delete(na_nm); err_mem_finalize: netmap_mem_deref(na_dr->nm_mem, na_dr); return ret; } /* To be called under core lock. */ static int ptnet_stop(struct ptnet_softc *sc) { if_t ifp = sc->ifp; struct netmap_adapter *na_dr = &sc->ptna->dr.up; struct netmap_adapter *na_nm = &sc->ptna->hwup.up; int i; device_printf(sc->dev, "%s\n", __func__); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { return 0; /* nothing to do */ } /* Clear the driver-ready flag, and synchronize with all the queues, * so that after this loop we are sure nobody is working anymore with * the device. This scheme is taken from the vtnet driver. */ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->tick); for (i = 0; i < sc->num_rings; i++) { PTNET_Q_LOCK(sc->queues + i); PTNET_Q_UNLOCK(sc->queues + i); } ptnet_nm_register(na_dr, 0 /* off */); if (sc->ptna->backend_regifs == 0) { netmap_mem_rings_delete(na_dr); ptnet_nm_krings_delete(na_nm); } netmap_mem_deref(na_dr->nm_mem, na_dr); return 0; } static void ptnet_qflush(if_t ifp) { struct ptnet_softc *sc = if_getsoftc(ifp); int i; /* Flush all the bufrings and do the interface flush. */ for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; struct mbuf *m; PTNET_Q_LOCK(pq); if (pq->bufring) { while ((m = buf_ring_dequeue_sc(pq->bufring))) { m_freem(m); } } PTNET_Q_UNLOCK(pq); } if_qflush(ifp); } static int ptnet_media_change(if_t ifp) { struct ptnet_softc *sc = if_getsoftc(ifp); struct ifmedia *ifm = &sc->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) { return EINVAL; } return 0; } #if __FreeBSD_version >= 1100000 static uint64_t ptnet_get_counter(if_t ifp, ift_counter cnt) { struct ptnet_softc *sc = if_getsoftc(ifp); struct ptnet_queue_stats stats[2]; int i; /* Accumulate statistics over the queues. */ memset(stats, 0, sizeof(stats)); for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; int idx = (i < sc->num_tx_rings) ? 
0 : 1; stats[idx].packets += pq->stats.packets; stats[idx].bytes += pq->stats.bytes; stats[idx].errors += pq->stats.errors; stats[idx].iqdrops += pq->stats.iqdrops; stats[idx].mcasts += pq->stats.mcasts; } switch (cnt) { case IFCOUNTER_IPACKETS: return (stats[1].packets); case IFCOUNTER_IQDROPS: return (stats[1].iqdrops); case IFCOUNTER_IERRORS: return (stats[1].errors); case IFCOUNTER_OPACKETS: return (stats[0].packets); case IFCOUNTER_OBYTES: return (stats[0].bytes); case IFCOUNTER_OMCASTS: return (stats[0].mcasts); default: return (if_get_counter_default(ifp, cnt)); } } #endif #ifdef PTNETMAP_STATS /* Called under core lock. */ static void ptnet_tick(void *opaque) { struct ptnet_softc *sc = opaque; int i; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; struct ptnet_queue_stats cur = pq->stats; struct timeval now; unsigned int delta; microtime(&now); delta = now.tv_usec - sc->last_ts.tv_usec + (now.tv_sec - sc->last_ts.tv_sec) * 1000000; delta /= 1000; /* in milliseconds */ if (delta == 0) continue; device_printf(sc->dev, "#%d[%u ms]:pkts %lu, kicks %lu, " "intr %lu\n", i, delta, (cur.packets - pq->last_stats.packets), (cur.kicks - pq->last_stats.kicks), (cur.intrs - pq->last_stats.intrs)); pq->last_stats = cur; } microtime(&sc->last_ts); callout_schedule(&sc->tick, hz); } #endif /* PTNETMAP_STATS */ static void ptnet_media_status(if_t ifp, struct ifmediareq *ifmr) { /* We are always active, as the backend netmap port is * always open in netmap mode. */ ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; } static uint32_t ptnet_nm_ptctl(if_t ifp, uint32_t cmd) { struct ptnet_softc *sc = if_getsoftc(ifp); - int ret; - + /* + * Write a command and read back error status, + * with zero meaning success. + */ bus_write_4(sc->iomem, PTNET_IO_PTCTL, cmd); - ret = bus_read_4(sc->iomem, PTNET_IO_PTSTS); - device_printf(sc->dev, "PTCTL %u, ret %u\n", cmd, ret); - - return ret; + return bus_read_4(sc->iomem, PTNET_IO_PTCTL); } static int ptnet_nm_config(struct netmap_adapter *na, unsigned *txr, unsigned *txd, unsigned *rxr, unsigned *rxd) { struct ptnet_softc *sc = if_getsoftc(na->ifp); *txr = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_RINGS); *rxr = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_RINGS); *txd = bus_read_4(sc->iomem, PTNET_IO_NUM_TX_SLOTS); *rxd = bus_read_4(sc->iomem, PTNET_IO_NUM_RX_SLOTS); device_printf(sc->dev, "txr %u, rxr %u, txd %u, rxd %u\n", *txr, *rxr, *txd, *rxd); return 0; } static void ptnet_sync_from_csb(struct ptnet_softc *sc, struct netmap_adapter *na) { int i; /* Sync krings from the host, reading from * CSB. */ for (i = 0; i < sc->num_rings; i++) { struct ptnet_ring *ptring = sc->queues[i].ptring; struct netmap_kring *kring; if (i < na->num_tx_rings) { kring = na->tx_rings + i; } else { kring = na->rx_rings + i - na->num_tx_rings; } kring->rhead = kring->ring->head = ptring->head; kring->rcur = kring->ring->cur = ptring->cur; kring->nr_hwcur = ptring->hwcur; kring->nr_hwtail = kring->rtail = kring->ring->tail = ptring->hwtail; ND("%d,%d: csb {hc %u h %u c %u ht %u}", t, i, ptring->hwcur, ptring->head, ptring->cur, ptring->hwtail); ND("%d,%d: kring {hc %u rh %u rc %u h %u c %u ht %u rt %u t %u}", t, i, kring->nr_hwcur, kring->rhead, kring->rcur, kring->ring->head, kring->ring->cur, kring->nr_hwtail, kring->rtail, kring->ring->tail); } } static void ptnet_update_vnet_hdr(struct ptnet_softc *sc) { unsigned int wanted_hdr_len = ptnet_vnet_hdr ? 
PTNET_HDR_SIZE : 0; bus_write_4(sc->iomem, PTNET_IO_VNET_HDR_LEN, wanted_hdr_len); sc->vnet_hdr_len = bus_read_4(sc->iomem, PTNET_IO_VNET_HDR_LEN); sc->ptna->hwup.up.virt_hdr_len = sc->vnet_hdr_len; } static int ptnet_nm_register(struct netmap_adapter *na, int onoff) { /* device-specific */ if_t ifp = na->ifp; struct ptnet_softc *sc = if_getsoftc(ifp); int native = (na == &sc->ptna->hwup.up); struct ptnet_queue *pq; enum txrx t; int ret = 0; int i; if (!onoff) { sc->ptna->backend_regifs--; } /* If this is the last netmap client, guest interrupt enable flags may * be in arbitrary state. Since these flags are going to be used also * by the netdevice driver, we have to make sure to start with * notifications enabled. Also, schedule NAPI to flush pending packets * in the RX rings, since we will not receive further interrupts * until these will be processed. */ if (native && !onoff && na->active_fds == 0) { D("Exit netmap mode, re-enable interrupts"); for (i = 0; i < sc->num_rings; i++) { pq = sc->queues + i; pq->ptring->guest_need_kick = 1; } } if (onoff) { if (sc->ptna->backend_regifs == 0) { /* Initialize notification enable fields in the CSB. */ for (i = 0; i < sc->num_rings; i++) { pq = sc->queues + i; pq->ptring->host_need_kick = 1; pq->ptring->guest_need_kick = (!(ifp->if_capenable & IFCAP_POLLING) && i >= sc->num_tx_rings); } /* Set the virtio-net header length. */ ptnet_update_vnet_hdr(sc); /* Make sure the host adapter passed through is ready * for txsync/rxsync. */ - ret = ptnet_nm_ptctl(ifp, PTNETMAP_PTCTL_REGIF); + ret = ptnet_nm_ptctl(ifp, PTNETMAP_PTCTL_CREATE); if (ret) { return ret; } } /* Sync from CSB must be done after REGIF PTCTL. Skip this * step only if this is a netmap client and it is not the * first one. */ if ((!native && sc->ptna->backend_regifs == 0) || (native && na->active_fds == 0)) { ptnet_sync_from_csb(sc, na); } /* If not native, don't call nm_set_native_flags, since we don't want * to replace if_transmit method, nor set NAF_NETMAP_ON */ if (native) { for_rx_tx(t) { for (i = 0; i <= nma_get_nrings(na, t); i++) { struct netmap_kring *kring = &NMR(na, t)[i]; if (nm_kring_pending_on(kring)) { kring->nr_mode = NKR_NETMAP_ON; } } } nm_set_native_flags(na); } } else { if (native) { nm_clear_native_flags(na); for_rx_tx(t) { for (i = 0; i <= nma_get_nrings(na, t); i++) { struct netmap_kring *kring = &NMR(na, t)[i]; if (nm_kring_pending_off(kring)) { kring->nr_mode = NKR_NETMAP_OFF; } } } } /* Sync from CSB must be done before UNREGIF PTCTL, on the last * netmap client. 
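 * (the host side presumably stops updating the CSB once it is
 * unregistered, so the final hwcur/hwtail values must be read back
 * first)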
*/ if (native && na->active_fds == 0) { ptnet_sync_from_csb(sc, na); } if (sc->ptna->backend_regifs == 0) { - ret = ptnet_nm_ptctl(ifp, PTNETMAP_PTCTL_UNREGIF); + ret = ptnet_nm_ptctl(ifp, PTNETMAP_PTCTL_DELETE); } } if (onoff) { sc->ptna->backend_regifs++; } return ret; } static int ptnet_nm_txsync(struct netmap_kring *kring, int flags) { struct ptnet_softc *sc = if_getsoftc(kring->na->ifp); struct ptnet_queue *pq = sc->queues + kring->ring_id; bool notify; notify = netmap_pt_guest_txsync(pq->ptring, kring, flags); if (notify) { ptnet_kick(pq); } return 0; } static int ptnet_nm_rxsync(struct netmap_kring *kring, int flags) { struct ptnet_softc *sc = if_getsoftc(kring->na->ifp); struct ptnet_queue *pq = sc->rxqueues + kring->ring_id; bool notify; notify = netmap_pt_guest_rxsync(pq->ptring, kring, flags); if (notify) { ptnet_kick(pq); } return 0; } static void ptnet_tx_intr(void *opaque) { struct ptnet_queue *pq = opaque; struct ptnet_softc *sc = pq->sc; DBG(device_printf(sc->dev, "Tx interrupt #%d\n", pq->kring_id)); #ifdef PTNETMAP_STATS pq->stats.intrs ++; #endif /* PTNETMAP_STATS */ if (netmap_tx_irq(sc->ifp, pq->kring_id) != NM_IRQ_PASS) { return; } /* Schedule the tasqueue to flush process transmissions requests. * However, vtnet, if_em and if_igb just call ptnet_transmit() here, * at least when using MSI-X interrupts. The if_em driver, instead * schedule taskqueue when using legacy interrupts. */ taskqueue_enqueue(pq->taskq, &pq->task); } static void ptnet_rx_intr(void *opaque) { struct ptnet_queue *pq = opaque; struct ptnet_softc *sc = pq->sc; unsigned int unused; DBG(device_printf(sc->dev, "Rx interrupt #%d\n", pq->kring_id)); #ifdef PTNETMAP_STATS pq->stats.intrs ++; #endif /* PTNETMAP_STATS */ if (netmap_rx_irq(sc->ifp, pq->kring_id, &unused) != NM_IRQ_PASS) { return; } /* Like vtnet, if_igb and if_em drivers when using MSI-X interrupts, * receive-side processing is executed directly in the interrupt * service routine. Alternatively, we may schedule the taskqueue. */ ptnet_rx_eof(pq, PTNET_RX_BUDGET, true); } /* The following offloadings-related functions are taken from the vtnet * driver, but the same functionality is required for the ptnet driver. * As a temporary solution, I copied this code from vtnet and I started * to generalize it (taking away driver-specific statistic accounting), * making as little modifications as possible. * In the future we need to share these functions between vtnet and ptnet. */ static int ptnet_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start) { struct ether_vlan_header *evh; int offset; evh = mtod(m, struct ether_vlan_header *); if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { /* BMV: We should handle nested VLAN tags too. */ *etype = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else { *etype = ntohs(evh->evl_encap_proto); offset = sizeof(struct ether_header); } switch (*etype) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip, iphdr; if (__predict_false(m->m_len < offset + sizeof(struct ip))) { m_copydata(m, offset, sizeof(struct ip), (caddr_t) &iphdr); ip = &iphdr; } else ip = (struct ip *)(m->m_data + offset); *proto = ip->ip_p; *start = offset + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: *proto = -1; *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); /* Assert the network stack sent us a valid packet. 
*/ KASSERT(*start > offset, ("%s: mbuf %p start %d offset %d proto %d", __func__, m, *start, offset, *proto)); break; #endif default: /* Here we should increment the tx_csum_bad_ethtype counter. */ return (EINVAL); } return (0); } static int ptnet_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type, int offset, bool allow_ecn, struct virtio_net_hdr *hdr) { static struct timeval lastecn; static int curecn; struct tcphdr *tcp, tcphdr; if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); tcp = &tcphdr; } else tcp = (struct tcphdr *)(m->m_data + offset); hdr->hdr_len = offset + (tcp->th_off << 2); hdr->gso_size = m->m_pkthdr.tso_segsz; hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : VIRTIO_NET_HDR_GSO_TCPV6; if (tcp->th_flags & TH_CWR) { /* * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD, * ECN support is not on a per-interface basis, but globally via * the net.inet.tcp.ecn.enable sysctl knob. The default is off. */ if (!allow_ecn) { if (ppsratecheck(&lastecn, &curecn, 1)) if_printf(ifp, "TSO with ECN not negotiated with host\n"); return (ENOTSUP); } hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } /* Here we should increment tx_tso counter. */ return (0); } static struct mbuf * ptnet_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn, struct virtio_net_hdr *hdr) { int flags, etype, csum_start, proto, error; flags = m->m_pkthdr.csum_flags; error = ptnet_tx_offload_ctx(m, &etype, &proto, &csum_start); if (error) goto drop; if ((etype == ETHERTYPE_IP && flags & PTNET_CSUM_OFFLOAD) || (etype == ETHERTYPE_IPV6 && flags & PTNET_CSUM_OFFLOAD_IPV6)) { /* * We could compare the IP protocol vs the CSUM_ flag too, * but that really should not be necessary. */ hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = csum_start; hdr->csum_offset = m->m_pkthdr.csum_data; /* Here we should increment the tx_csum counter. */ } if (flags & CSUM_TSO) { if (__predict_false(proto != IPPROTO_TCP)) { /* Likely failed to correctly parse the mbuf. * Here we should increment the tx_tso_not_tcp * counter. */ goto drop; } KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM, ("%s: mbuf %p TSO without checksum offload %#x", __func__, m, flags)); error = ptnet_tx_offload_tso(ifp, m, etype, csum_start, allow_ecn, hdr); if (error) goto drop; } return (m); drop: m_freem(m); return (NULL); } static void ptnet_vlan_tag_remove(struct mbuf *m) { struct ether_vlan_header *evh; evh = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag); m->m_flags |= M_VLANTAG; /* Strip the 802.1Q header. */ bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } /* * Use the checksum offset in the VirtIO header to set the * correct CSUM_* flags. */ static int ptnet_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { #if defined(INET) || defined(INET6) int offset = hdr->csum_start + hdr->csum_offset; #endif /* Only do a basic sanity check on the offset. */ switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: if (__predict_false(offset < ip_start + sizeof(struct ip))) return (1); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr))) return (1); break; #endif default: /* Here we should increment the rx_csum_bad_ethtype counter. */ return (1); } /* * Use the offset to determine the appropriate CSUM_* flags. 
This is * a bit dirty, but we can get by with it since the checksum offsets * happen to be different. We assume the host host does not do IPv4 * header checksum offloading. */ switch (hdr->csum_offset) { case offsetof(struct udphdr, uh_sum): case offsetof(struct tcphdr, th_sum): m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case offsetof(struct sctphdr, checksum): m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; default: /* Here we should increment the rx_csum_bad_offset counter. */ return (1); } return (0); } static int ptnet_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { int offset, proto; switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip; if (__predict_false(m->m_len < ip_start + sizeof(struct ip))) return (1); ip = (struct ip *)(m->m_data + ip_start); proto = ip->ip_p; offset = ip_start + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(m->m_len < ip_start + sizeof(struct ip6_hdr))) return (1); offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto); if (__predict_false(offset < 0)) return (1); break; #endif default: /* Here we should increment the rx_csum_bad_ethtype counter. */ return (1); } switch (proto) { case IPPROTO_TCP: if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case IPPROTO_UDP: if (__predict_false(m->m_len < offset + sizeof(struct udphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case IPPROTO_SCTP: if (__predict_false(m->m_len < offset + sizeof(struct sctphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; default: /* * For the remaining protocols, FreeBSD does not support * checksum offloading, so the checksum will be recomputed. */ #if 0 if_printf(ifp, "cksum offload of unsupported " "protocol eth_type=%#x proto=%d csum_start=%d " "csum_offset=%d\n", __func__, eth_type, proto, hdr->csum_start, hdr->csum_offset); #endif break; } return (0); } /* * Set the appropriate CSUM_* flags. Unfortunately, the information * provided is not directly useful to us. The VirtIO header gives the * offset of the checksum, which is all Linux needs, but this is not * how FreeBSD does things. We are forced to peek inside the packet * a bit. * * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD * could accept the offsets and let the stack figure it out. */ static int ptnet_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr) { struct ether_header *eh; struct ether_vlan_header *evh; uint16_t eth_type; int offset, error; eh = mtod(m, struct ether_header *); eth_type = ntohs(eh->ether_type); if (eth_type == ETHERTYPE_VLAN) { /* BMV: We should handle nested VLAN tags too. */ evh = mtod(m, struct ether_vlan_header *); eth_type = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else offset = sizeof(struct ether_header); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) error = ptnet_rx_csum_by_offset(m, eth_type, offset, hdr); else error = ptnet_rx_csum_by_parse(m, eth_type, offset, hdr); return (error); } /* End of offloading-related functions to be shared with vtnet. */ static inline void ptnet_sync_tail(struct ptnet_ring *ptring, struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; /* Update hwcur and hwtail as known by the host. 
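 * For the TX path this refreshes rtail, i.e. the free space checked by
 * PTNET_TX_NOSPACE() below.  For example, with nkr_num_slots 1024,
 * head 1000 and rtail 10, the free space works out to
 * 1024 + 10 - 1000 = 34 slots.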
*/ ptnetmap_guest_read_kring_csb(ptring, kring); /* nm_sync_finalize */ ring->tail = kring->rtail = kring->nr_hwtail; } static void ptnet_ring_update(struct ptnet_queue *pq, struct netmap_kring *kring, unsigned int head, unsigned int sync_flags) { struct netmap_ring *ring = kring->ring; struct ptnet_ring *ptring = pq->ptring; /* Some packets have been pushed to the netmap ring. We have * to tell the host to process the new packets, updating cur * and head in the CSB. */ ring->head = ring->cur = head; /* Mimic nm_txsync_prologue/nm_rxsync_prologue. */ kring->rcur = kring->rhead = head; ptnetmap_guest_write_kring_csb(ptring, kring->rcur, kring->rhead); /* Kick the host if needed. */ if (NM_ACCESS_ONCE(ptring->host_need_kick)) { ptring->sync_flags = sync_flags; ptnet_kick(pq); } } #define PTNET_TX_NOSPACE(_h, _k, _min) \ ((((_h) < (_k)->rtail) ? 0 : (_k)->nkr_num_slots) + \ (_k)->rtail - (_h)) < (_min) /* This function may be called by the network stack, or by * by the taskqueue thread. */ static int ptnet_drain_transmit_queue(struct ptnet_queue *pq, unsigned int budget, bool may_resched) { struct ptnet_softc *sc = pq->sc; bool have_vnet_hdr = sc->vnet_hdr_len; struct netmap_adapter *na = &sc->ptna->dr.up; if_t ifp = sc->ifp; unsigned int batch_count = 0; struct ptnet_ring *ptring; struct netmap_kring *kring; struct netmap_ring *ring; struct netmap_slot *slot; unsigned int count = 0; unsigned int minspace; unsigned int head; unsigned int lim; struct mbuf *mhead; struct mbuf *mf; int nmbuf_bytes; uint8_t *nmbuf; if (!PTNET_Q_TRYLOCK(pq)) { /* We failed to acquire the lock, schedule the taskqueue. */ RD(1, "Deferring TX work"); if (may_resched) { taskqueue_enqueue(pq->taskq, &pq->task); } return 0; } if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { PTNET_Q_UNLOCK(pq); RD(1, "Interface is down"); return ENETDOWN; } ptring = pq->ptring; kring = na->tx_rings + pq->kring_id; ring = kring->ring; lim = kring->nkr_num_slots - 1; head = ring->head; minspace = sc->min_tx_space; while (count < budget) { if (PTNET_TX_NOSPACE(head, kring, minspace)) { /* We ran out of slot, let's see if the host has * freed up some, by reading hwcur and hwtail from * the CSB. */ ptnet_sync_tail(ptring, kring); if (PTNET_TX_NOSPACE(head, kring, minspace)) { /* Still no slots available. Reactivate the * interrupts so that we can be notified * when some free slots are made available by * the host. */ ptring->guest_need_kick = 1; /* Double-check. */ ptnet_sync_tail(ptring, kring); if (likely(PTNET_TX_NOSPACE(head, kring, minspace))) { break; } RD(1, "Found more slots by doublecheck"); /* More slots were freed before reactivating * the interrupts. */ ptring->guest_need_kick = 0; } } mhead = drbr_peek(ifp, pq->bufring); if (!mhead) { break; } /* Initialize transmission state variables. */ slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_bytes = 0; /* If needed, prepare the virtio-net header at the beginning * of the first slot. */ if (have_vnet_hdr) { struct virtio_net_hdr *vh = (struct virtio_net_hdr *)nmbuf; /* For performance, we could replace this memset() with * two 8-bytes-wide writes. */ memset(nmbuf, 0, PTNET_HDR_SIZE); if (mhead->m_pkthdr.csum_flags & PTNET_ALL_OFFLOAD) { mhead = ptnet_tx_offload(ifp, mhead, false, vh); if (unlikely(!mhead)) { /* Packet dropped because errors * occurred while preparing the vnet * header. Let's go ahead with the next * packet. 
*/ pq->stats.errors ++; drbr_advance(ifp, pq->bufring); continue; } } ND(1, "%s: [csum_flags %lX] vnet hdr: flags %x " "csum_start %u csum_ofs %u hdr_len = %u " "gso_size %u gso_type %x", __func__, mhead->m_pkthdr.csum_flags, vh->flags, vh->csum_start, vh->csum_offset, vh->hdr_len, vh->gso_size, vh->gso_type); nmbuf += PTNET_HDR_SIZE; nmbuf_bytes += PTNET_HDR_SIZE; } for (mf = mhead; mf; mf = mf->m_next) { uint8_t *mdata = mf->m_data; int mlen = mf->m_len; for (;;) { int copy = NETMAP_BUF_SIZE(na) - nmbuf_bytes; if (mlen < copy) { copy = mlen; } memcpy(nmbuf, mdata, copy); mdata += copy; mlen -= copy; nmbuf += copy; nmbuf_bytes += copy; if (!mlen) { break; } slot->len = nmbuf_bytes; slot->flags = NS_MOREFRAG; head = nm_next(head, lim); KASSERT(head != ring->tail, ("Unexpectedly run out of TX space")); slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_bytes = 0; } } /* Complete last slot and update head. */ slot->len = nmbuf_bytes; slot->flags = 0; head = nm_next(head, lim); /* Consume the packet just processed. */ drbr_advance(ifp, pq->bufring); /* Copy the packet to listeners. */ ETHER_BPF_MTAP(ifp, mhead); pq->stats.packets ++; pq->stats.bytes += mhead->m_pkthdr.len; if (mhead->m_flags & M_MCAST) { pq->stats.mcasts ++; } m_freem(mhead); count ++; if (++batch_count == PTNET_TX_BATCH) { ptnet_ring_update(pq, kring, head, NAF_FORCE_RECLAIM); batch_count = 0; } } if (batch_count) { ptnet_ring_update(pq, kring, head, NAF_FORCE_RECLAIM); } if (count >= budget && may_resched) { DBG(RD(1, "out of budget: resched, %d mbufs pending\n", drbr_inuse(ifp, pq->bufring))); taskqueue_enqueue(pq->taskq, &pq->task); } PTNET_Q_UNLOCK(pq); return count; } static int ptnet_transmit(if_t ifp, struct mbuf *m) { struct ptnet_softc *sc = if_getsoftc(ifp); struct ptnet_queue *pq; unsigned int queue_idx; int err; DBG(device_printf(sc->dev, "transmit %p\n", m)); /* Insert 802.1Q header if needed. */ if (m->m_flags & M_VLANTAG) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if (m == NULL) { return ENOBUFS; } m->m_flags &= ~M_VLANTAG; } /* Get the flow-id if available. */ queue_idx = (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) ? m->m_pkthdr.flowid : curcpu; if (unlikely(queue_idx >= sc->num_tx_rings)) { queue_idx %= sc->num_tx_rings; } pq = sc->queues + queue_idx; err = drbr_enqueue(ifp, pq->bufring, m); if (err) { /* ENOBUFS when the bufring is full */ RD(1, "%s: drbr_enqueue() failed %d\n", __func__, err); pq->stats.errors ++; return err; } if (ifp->if_capenable & IFCAP_POLLING) { /* If polling is on, the transmit queues will be * drained by the poller. */ return 0; } err = ptnet_drain_transmit_queue(pq, PTNET_TX_BUDGET, true); return (err < 0) ? 
err : 0; } static unsigned int ptnet_rx_discard(struct netmap_kring *kring, unsigned int head) { struct netmap_ring *ring = kring->ring; struct netmap_slot *slot = ring->slot + head; for (;;) { head = nm_next(head, kring->nkr_num_slots - 1); if (!(slot->flags & NS_MOREFRAG) || head == ring->tail) { break; } slot = ring->slot + head; } return head; } static inline struct mbuf * ptnet_rx_slot(struct mbuf *mtail, uint8_t *nmbuf, unsigned int nmbuf_len) { uint8_t *mdata = mtod(mtail, uint8_t *) + mtail->m_len; do { unsigned int copy; if (mtail->m_len == MCLBYTES) { struct mbuf *mf; mf = m_getcl(M_NOWAIT, MT_DATA, 0); if (unlikely(!mf)) { return NULL; } mtail->m_next = mf; mtail = mf; mdata = mtod(mtail, uint8_t *); mtail->m_len = 0; } copy = MCLBYTES - mtail->m_len; if (nmbuf_len < copy) { copy = nmbuf_len; } memcpy(mdata, nmbuf, copy); nmbuf += copy; nmbuf_len -= copy; mdata += copy; mtail->m_len += copy; } while (nmbuf_len); return mtail; } static int ptnet_rx_eof(struct ptnet_queue *pq, unsigned int budget, bool may_resched) { struct ptnet_softc *sc = pq->sc; bool have_vnet_hdr = sc->vnet_hdr_len; struct ptnet_ring *ptring = pq->ptring; struct netmap_adapter *na = &sc->ptna->dr.up; struct netmap_kring *kring = na->rx_rings + pq->kring_id; struct netmap_ring *ring = kring->ring; unsigned int const lim = kring->nkr_num_slots - 1; unsigned int head = ring->head; unsigned int batch_count = 0; if_t ifp = sc->ifp; unsigned int count = 0; PTNET_Q_LOCK(pq); if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { goto unlock; } kring->nr_kflags &= ~NKR_PENDINTR; while (count < budget) { unsigned int prev_head = head; struct mbuf *mhead, *mtail; struct virtio_net_hdr *vh; struct netmap_slot *slot; unsigned int nmbuf_len; uint8_t *nmbuf; host_sync: if (head == ring->tail) { /* We ran out of slot, let's see if the host has * added some, by reading hwcur and hwtail from * the CSB. */ ptnet_sync_tail(ptring, kring); if (head == ring->tail) { /* Still no slots available. Reactivate * interrupts as they were disabled by the * host thread right before issuing the * last interrupt. */ ptring->guest_need_kick = 1; /* Double-check. */ ptnet_sync_tail(ptring, kring); if (likely(head == ring->tail)) { break; } ptring->guest_need_kick = 0; } } /* Initialize ring state variables, possibly grabbing the * virtio-net header. */ slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_len = slot->len; vh = (struct virtio_net_hdr *)nmbuf; if (have_vnet_hdr) { if (unlikely(nmbuf_len < PTNET_HDR_SIZE)) { /* There is no good reason why host should * put the header in multiple netmap slots. * If this is the case, discard. */ RD(1, "Fragmented vnet-hdr: dropping"); head = ptnet_rx_discard(kring, head); pq->stats.iqdrops ++; goto skip; } ND(1, "%s: vnet hdr: flags %x csum_start %u " "csum_ofs %u hdr_len = %u gso_size %u " "gso_type %x", __func__, vh->flags, vh->csum_start, vh->csum_offset, vh->hdr_len, vh->gso_size, vh->gso_type); nmbuf += PTNET_HDR_SIZE; nmbuf_len -= PTNET_HDR_SIZE; } /* Allocate the head of a new mbuf chain. * We use m_getcl() to allocate an mbuf with standard cluster * size (MCLBYTES). In the future we could use m_getjcl() * to choose different sizes. */ mhead = mtail = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (unlikely(mhead == NULL)) { device_printf(sc->dev, "%s: failed to allocate mbuf " "head\n", __func__); pq->stats.errors ++; break; } /* Initialize the mbuf state variables. */ mhead->m_pkthdr.len = nmbuf_len; mtail->m_len = 0; /* Scan all the netmap slots containing the current packet. 
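 * A packet larger than one netmap buffer spans multiple slots, with
 * NS_MOREFRAG set on every slot except the last one; the loop below
 * copies each fragment into the mbuf chain and accumulates the total
 * length in m_pkthdr.len.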
*/ for (;;) { DBG(device_printf(sc->dev, "%s: h %u t %u rcv frag " "len %u, flags %u\n", __func__, head, ring->tail, slot->len, slot->flags)); mtail = ptnet_rx_slot(mtail, nmbuf, nmbuf_len); if (unlikely(!mtail)) { /* Ouch. We ran out of memory while processing * a packet. We have to restore the previous * head position, free the mbuf chain, and * schedule the taskqueue to give the packet * another chance. */ device_printf(sc->dev, "%s: failed to allocate" " mbuf frag, reset head %u --> %u\n", __func__, head, prev_head); head = prev_head; m_freem(mhead); pq->stats.errors ++; if (may_resched) { taskqueue_enqueue(pq->taskq, &pq->task); } goto escape; } /* We have to increment head irrespective of the * NS_MOREFRAG being set or not. */ head = nm_next(head, lim); if (!(slot->flags & NS_MOREFRAG)) { break; } if (unlikely(head == ring->tail)) { /* The very last slot prepared by the host has * the NS_MOREFRAG set. Drop it and continue * the outer cycle (to do the double-check). */ RD(1, "Incomplete packet: dropping"); m_freem(mhead); pq->stats.iqdrops ++; goto host_sync; } slot = ring->slot + head; nmbuf = NMB(na, slot); nmbuf_len = slot->len; mhead->m_pkthdr.len += nmbuf_len; } mhead->m_pkthdr.rcvif = ifp; mhead->m_pkthdr.csum_flags = 0; /* Store the queue idx in the packet header. */ mhead->m_pkthdr.flowid = pq->kring_id; M_HASHTYPE_SET(mhead, M_HASHTYPE_OPAQUE); if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { struct ether_header *eh; eh = mtod(mhead, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { ptnet_vlan_tag_remove(mhead); /* * With the 802.1Q header removed, update the * checksum starting location accordingly. */ if (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) vh->csum_start -= ETHER_VLAN_ENCAP_LEN; } } if (have_vnet_hdr && (vh->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM | VIRTIO_NET_HDR_F_DATA_VALID))) { if (unlikely(ptnet_rx_csum(mhead, vh))) { m_freem(mhead); RD(1, "Csum offload error: dropping"); pq->stats.iqdrops ++; goto skip; } } pq->stats.packets ++; pq->stats.bytes += mhead->m_pkthdr.len; PTNET_Q_UNLOCK(pq); (*ifp->if_input)(ifp, mhead); PTNET_Q_LOCK(pq); if (unlikely(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) { /* The interface has gone down while we didn't * have the lock. Stop any processing and exit. */ goto unlock; } skip: count ++; if (++batch_count == PTNET_RX_BATCH) { /* Some packets have been pushed to the network stack. * We need to update the CSB to tell the host about the new * ring->cur and ring->head (RX buffer refill). */ ptnet_ring_update(pq, kring, head, NAF_FORCE_READ); batch_count = 0; } } escape: if (batch_count) { ptnet_ring_update(pq, kring, head, NAF_FORCE_READ); } if (count >= budget && may_resched) { /* If we ran out of budget or the double-check found new * slots to process, schedule the taskqueue. */ DBG(RD(1, "out of budget: resched h %u t %u\n", head, ring->tail)); taskqueue_enqueue(pq->taskq, &pq->task); } unlock: PTNET_Q_UNLOCK(pq); return count; } static void ptnet_rx_task(void *context, int pending) { struct ptnet_queue *pq = context; DBG(RD(1, "%s: pq #%u\n", __func__, pq->kring_id)); ptnet_rx_eof(pq, PTNET_RX_BUDGET, true); } static void ptnet_tx_task(void *context, int pending) { struct ptnet_queue *pq = context; DBG(RD(1, "%s: pq #%u\n", __func__, pq->kring_id)); ptnet_drain_transmit_queue(pq, PTNET_TX_BUDGET, true); } #ifdef DEVICE_POLLING /* We don't need to handle differently POLL_AND_CHECK_STATUS and * POLL_ONLY, since we don't have an Interrupt Status Register. 
*/ static int ptnet_poll(if_t ifp, enum poll_cmd cmd, int budget) { struct ptnet_softc *sc = if_getsoftc(ifp); unsigned int queue_budget; unsigned int count = 0; bool borrow = false; int i; KASSERT(sc->num_rings > 0, ("Found no queues in while polling ptnet")); queue_budget = MAX(budget / sc->num_rings, 1); RD(1, "Per-queue budget is %d", queue_budget); while (budget) { unsigned int rcnt = 0; for (i = 0; i < sc->num_rings; i++) { struct ptnet_queue *pq = sc->queues + i; if (borrow) { queue_budget = MIN(queue_budget, budget); if (queue_budget == 0) { break; } } if (i < sc->num_tx_rings) { rcnt += ptnet_drain_transmit_queue(pq, queue_budget, false); } else { rcnt += ptnet_rx_eof(pq, queue_budget, false); } } if (!rcnt) { /* A scan of the queues gave no result, we can * stop here. */ break; } if (rcnt > budget) { /* This may happen when initial budget < sc->num_rings, * since one packet budget is given to each queue * anyway. Just pretend we didn't eat "so much". */ rcnt = budget; } count += rcnt; budget -= rcnt; borrow = true; } return count; } #endif /* DEVICE_POLLING */ Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap.c (revision 308054) @@ -1,3329 +1,3338 @@ /* * Copyright (C) 2011-2014 Matteo Landi * Copyright (C) 2011-2016 Luigi Rizzo * Copyright (C) 2011-2016 Giuseppe Lettieri * Copyright (C) 2011-2016 Vincenzo Maffione * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * This module supports memory mapped access to network devices, * see netmap(4). * * The module uses a large, memory pool allocated by the kernel * and accessible as mmapped memory by multiple userspace threads/processes. * The memory pool contains packet buffers and "netmap rings", * i.e. user-accessible copies of the interface's queues. * * Access to the network card works like this: * 1. a process/thread issues one or more open() on /dev/netmap, to create * select()able file descriptor on which events are reported. * 2. on each descriptor, the process issues an ioctl() to identify * the interface that should report events to the file descriptor. * 3. 
on each descriptor, the process issues an mmap() request to * map the shared memory region within the process' address space. * The list of interesting queues is indicated by a location in * the shared memory region. * 4. using the functions in the netmap(4) userspace API, a process * can look up the occupation state of a queue, access memory buffers, * and retrieve received packets or enqueue packets to transmit. * 5. using some ioctl()s the process can synchronize the userspace view * of the queue with the actual status in the kernel. This includes both * receiving the notification of new packets, and transmitting new * packets on the output interface. * 6. select() or poll() can be used to wait for events on individual * transmit or receive queues (or all queues for a given interface). * SYNCHRONIZATION (USER) The netmap rings and data structures may be shared among multiple user threads or even independent processes. Any synchronization among those threads/processes is delegated to the threads themselves. Only one thread at a time can be in a system call on the same netmap ring. The OS does not enforce this and only guarantees against system crashes in case of invalid usage. LOCKING (INTERNAL) Within the kernel, access to the netmap rings is protected as follows: - a spinlock on each ring, to handle producer/consumer races on RX rings attached to the host stack (against multiple host threads writing from the host stack to the same ring), and on 'destination' rings attached to a VALE switch (i.e. RX rings in VALE ports, and TX rings in NIC/host ports) protecting multiple active senders for the same destination) - an atomic variable to guarantee that there is at most one instance of *_*xsync() on the ring at any time. For rings connected to user file descriptors, an atomic_test_and_set() protects this, and the lock on the ring is not actually used. For NIC RX rings connected to a VALE switch, an atomic_test_and_set() is also used to prevent multiple executions (the driver might indeed already guarantee this). For NIC TX rings connected to a VALE switch, the lock arbitrates access to the queue (both when allocating buffers and when pushing them out). - *xsync() should be protected against initializations of the card. On FreeBSD most devices have the reset routine protected by a RING lock (ixgbe, igb, em) or core lock (re). lem is missing the RING protection on rx_reset(), this should be added. On linux there is an external lock on the tx path, which probably also arbitrates access to the reset routine. XXX to be revised - a per-interface core_lock protecting access from the host stack while interfaces may be detached from netmap mode. XXX there should be no need for this lock if we detach the interfaces only while they are down. --- VALE SWITCH --- NMG_LOCK() serializes all modifications to switches and ports. A switch cannot be deleted until all ports are gone. For each switch, an SX lock (RWlock on linux) protects deletion of ports. When configuring or deleting a new port, the lock is acquired in exclusive mode (after holding NMG_LOCK). When forwarding, the lock is acquired in shared mode (without NMG_LOCK). The lock is held throughout the entire forwarding cycle, during which the thread may incur in a page fault. Hence it is important that sleepable shared locks are used. 
On the rx ring, the per-port lock is grabbed initially to reserve a number of slot in the ring, then the lock is released, packets are copied from source to destination, and then the lock is acquired again and the receive ring is updated. (A similar thing is done on the tx ring for NIC and host stack ports attached to the switch) */ /* --- internals ---- * * Roadmap to the code that implements the above. * * > 1. a process/thread issues one or more open() on /dev/netmap, to create * > select()able file descriptor on which events are reported. * * Internally, we allocate a netmap_priv_d structure, that will be * initialized on ioctl(NIOCREGIF). There is one netmap_priv_d * structure for each open(). * * os-specific: * FreeBSD: see netmap_open() (netmap_freebsd.c) * linux: see linux_netmap_open() (netmap_linux.c) * * > 2. on each descriptor, the process issues an ioctl() to identify * > the interface that should report events to the file descriptor. * * Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0. * Most important things happen in netmap_get_na() and * netmap_do_regif(), called from there. Additional details can be * found in the comments above those functions. * * In all cases, this action creates/takes-a-reference-to a * netmap_*_adapter describing the port, and allocates a netmap_if * and all necessary netmap rings, filling them with netmap buffers. * * In this phase, the sync callbacks for each ring are set (these are used * in steps 5 and 6 below). The callbacks depend on the type of adapter. * The adapter creation/initialization code puts them in the * netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they * are copied from there to the netmap_kring's during netmap_do_regif(), by * the nm_krings_create() callback. All the nm_krings_create callbacks * actually call netmap_krings_create() to perform this and the other * common stuff. netmap_krings_create() also takes care of the host rings, * if needed, by setting their sync callbacks appropriately. * * Additional actions depend on the kind of netmap_adapter that has been * registered: * * - netmap_hw_adapter: [netmap.c] * This is a system netdev/ifp with native netmap support. * The ifp is detached from the host stack by redirecting: * - transmissions (from the network stack) to netmap_transmit() * - receive notifications to the nm_notify() callback for * this adapter. The callback is normally netmap_notify(), unless * the ifp is attached to a bridge using bwrap, in which case it * is netmap_bwrap_intr_notify(). * * - netmap_generic_adapter: [netmap_generic.c] * A system netdev/ifp without native netmap support. * * (the decision about native/non native support is taken in * netmap_get_hw_na(), called by netmap_get_na()) * * - netmap_vp_adapter [netmap_vale.c] * Returned by netmap_get_bdg_na(). * This is a persistent or ephemeral VALE port. Ephemeral ports * are created on the fly if they don't already exist, and are * always attached to a bridge. * Persistent VALE ports must must be created separately, and i * then attached like normal NICs. The NIOCREGIF we are examining * will find them only if they had previosly been created and * attached (see VALE_CTL below). * * - netmap_pipe_adapter [netmap_pipe.c] * Returned by netmap_get_pipe_na(). * Both pipe ends are created, if they didn't already exist. * * - netmap_monitor_adapter [netmap_monitor.c] * Returned by netmap_get_monitor_na(). * If successful, the nm_sync callbacks of the monitored adapter * will be intercepted by the returned monitor. 
* * - netmap_bwrap_adapter [netmap_vale.c] * Cannot be obtained in this way, see VALE_CTL below * * * os-specific: * linux: we first go through linux_netmap_ioctl() to * adapt the FreeBSD interface to the linux one. * * * > 3. on each descriptor, the process issues an mmap() request to * > map the shared memory region within the process' address space. * > The list of interesting queues is indicated by a location in * > the shared memory region. * * os-specific: * FreeBSD: netmap_mmap_single (netmap_freebsd.c). * linux: linux_netmap_mmap (netmap_linux.c). * * > 4. using the functions in the netmap(4) userspace API, a process * > can look up the occupation state of a queue, access memory buffers, * > and retrieve received packets or enqueue packets to transmit. * * these actions do not involve the kernel. * * > 5. using some ioctl()s the process can synchronize the userspace view * > of the queue with the actual status in the kernel. This includes both * > receiving the notification of new packets, and transmitting new * > packets on the output interface. * * These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC * cases. They invoke the nm_sync callbacks on the netmap_kring * structures, as initialized in step 2 and maybe later modified * by a monitor. Monitors, however, will always call the original * callback before doing anything else. * * * > 6. select() or poll() can be used to wait for events on individual * > transmit or receive queues (or all queues for a given interface). * * Implemented in netmap_poll(). This will call the same nm_sync() * callbacks as in step 5 above. * * os-specific: * linux: we first go through linux_netmap_poll() to adapt * the FreeBSD interface to the linux one. * * * ---- VALE_CTL ----- * * VALE switches are controlled by issuing a NIOCREGIF with a non-null * nr_cmd in the nmreq structure. These subcommands are handled by * netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created * and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF * subcommands, respectively. * * Any network interface known to the system (including a persistent VALE * port) can be attached to a VALE switch by issuing the * NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports * look exactly like ephemeral VALE ports (as created in step 2 above). The * attachment of other interfaces, instead, requires the creation of a * netmap_bwrap_adapter. Moreover, the attached interface must be put in * netmap mode. This may require the creation of a netmap_generic_adapter if * we have no native support for the interface, or if generic adapters have * been forced by sysctl. * * Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(), * called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach() * callback. In the case of the bwrap, the callback creates the * netmap_bwrap_adapter. The initialization of the bwrap is then * completed by calling netmap_do_regif() on it, in the nm_bdg_ctl() * callback (netmap_bwrap_bdg_ctl in netmap_vale.c). * A generic adapter for the wrapped ifp will be created if needed, when * netmap_get_bdg_na() calls netmap_get_hw_na(). 
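 *
 * As a reference for steps 1-6 above, a minimal user-space cycle looks
 * roughly as follows (an illustrative sketch, not code from this file;
 * it assumes the helpers exported by net/netmap_user.h, i.e. NETMAP_IF,
 * NETMAP_TXRING, NETMAP_BUF, nm_ring_space and nm_ring_next, uses a
 * hypothetical "em0" port and omits all error handling):
 *
 *	struct nmreq req;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *txring;
 *	struct pollfd pfd;
 *	void *mem;
 *	int fd;
 *
 *	fd = open("/dev/netmap", O_RDWR);		/* step 1 */
 *	bzero(&req, sizeof(req));
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	req.nr_flags = NR_REG_ALL_NIC;
 *	ioctl(fd, NIOCREGIF, &req);			/* step 2 */
 *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				/* step 3 */
 *	nifp = NETMAP_IF(mem, req.nr_offset);
 *	txring = NETMAP_TXRING(nifp, 0);		/* step 4 */
 *	while (nm_ring_space(txring) > 0) {
 *		struct netmap_slot *slot = &txring->slot[txring->cur];
 *
 *		/* fill NETMAP_BUF(txring, slot->buf_idx), set slot->len */
 *		txring->head = txring->cur =
 *		    nm_ring_next(txring, txring->cur);
 *	}
 *	ioctl(fd, NIOCTXSYNC, NULL);			/* step 5 */
 *	pfd.fd = fd;
 *	pfd.events = POLLOUT;
 *	poll(&pfd, 1, -1);				/* step 6 */
 *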
* * * ---- DATAPATHS ----- * * -= SYSTEM DEVICE WITH NATIVE SUPPORT =- * * na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach() * * - tx from netmap userspace: * concurrently: * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context * kring->nm_sync() == DEVICE_netmap_txsync() * 2) device interrupt handler * na->nm_notify() == netmap_notify() * - rx from netmap userspace: * concurrently: * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context * kring->nm_sync() == DEVICE_netmap_rxsync() * 2) device interrupt handler * na->nm_notify() == netmap_notify() * - rx from host stack * concurrently: * 1) host stack * netmap_transmit() * na->nm_notify == netmap_notify() * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context * kring->nm_sync() == netmap_rxsync_from_host * netmap_rxsync_from_host(na, NULL, NULL) * - tx to host stack * ioctl(NIOCTXSYNC)/netmap_poll() in process context * kring->nm_sync() == netmap_txsync_to_host * netmap_txsync_to_host(na) * nm_os_send_up() * FreeBSD: na->if_input() == ether_input() * linux: netif_rx() with NM_MAGIC_PRIORITY_RX * * * -= SYSTEM DEVICE WITH GENERIC SUPPORT =- * * na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach() * * - tx from netmap userspace: * concurrently: * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context * kring->nm_sync() == generic_netmap_txsync() * nm_os_generic_xmit_frame() * linux: dev_queue_xmit() with NM_MAGIC_PRIORITY_TX * ifp->ndo_start_xmit == generic_ndo_start_xmit() * gna->save_start_xmit == orig. dev. start_xmit * FreeBSD: na->if_transmit() == orig. dev if_transmit * 2) generic_mbuf_destructor() * na->nm_notify() == netmap_notify() * - rx from netmap userspace: * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context * kring->nm_sync() == generic_netmap_rxsync() * mbq_safe_dequeue() * 2) device driver * generic_rx_handler() * mbq_safe_enqueue() * na->nm_notify() == netmap_notify() * - rx from host stack * FreeBSD: same as native * Linux: same as native except: * 1) host stack * dev_queue_xmit() without NM_MAGIC_PRIORITY_TX * ifp->ndo_start_xmit == generic_ndo_start_xmit() * netmap_transmit() * na->nm_notify() == netmap_notify() * - tx to host stack (same as native): * * * -= VALE =- * * INCOMING: * * - VALE ports: * ioctl(NIOCTXSYNC)/netmap_poll() in process context * kring->nm_sync() == netmap_vp_txsync() * * - system device with native support: * from cable: * interrupt * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring) * kring->nm_sync() == DEVICE_netmap_rxsync() * netmap_vp_txsync() * kring->nm_sync() == DEVICE_netmap_rxsync() * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * - system device with generic support: * from device driver: * generic_rx_handler() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring) * kring->nm_sync() == generic_netmap_rxsync() * netmap_vp_txsync() * kring->nm_sync() == generic_netmap_rxsync() * from host stack: * netmap_transmit() * na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring) * kring->nm_sync() == netmap_rxsync_from_host() * netmap_vp_txsync() * * (all cases) --> nm_bdg_flush() * dest_na->nm_notify() == (see below) * * OUTGOING: * * - VALE ports: * concurrently: * 1) ioctlNIOCRXSYNC)/netmap_poll() in process context * kring->nm_sync() == netmap_vp_rxsync() * 2) from nm_bdg_flush() * na->nm_notify() == netmap_notify() * * - system device with native support: * to cable: * 
na->nm_notify() == netmap_bwrap_notify() * netmap_vp_rxsync() * kring->nm_sync() == DEVICE_netmap_txsync() * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync_locked() * * - system device with generic adapter: * to device driver: * na->nm_notify() == netmap_bwrap_notify() * netmap_vp_rxsync() * kring->nm_sync() == generic_netmap_txsync() * netmap_vp_rxsync() * to host stack: * netmap_vp_rxsync() * kring->nm_sync() == netmap_txsync_to_host * netmap_vp_rxsync() * */ /* * OS-specific code that is used only within this file. * Other OS-specific code that must be accessed by drivers * is present in netmap_kern.h */ #if defined(__FreeBSD__) #include /* prerequisite */ #include #include #include /* defines used in kernel.h */ #include /* types used in module initialization */ #include /* cdevsw struct, UID, GID */ #include /* FIONBIO */ #include #include /* struct socket */ #include #include #include #include /* sockaddrs */ #include #include #include #include #include #include #include /* BIOCIMMEDIATE */ #include /* bus_dmamap_* */ #include #include #elif defined(linux) #include "bsd_glue.h" #elif defined(__APPLE__) #warning OSX support is only partial #include "osx_glue.h" #elif defined (_WIN32) #include "win_glue.h" #else #error Unsupported platform #endif /* unsupported */ /* * common headers */ #include #include #include /* user-controlled variables */ int netmap_verbose; static int netmap_no_timestamp; /* don't timestamp on rxsync */ int netmap_mitigate = 1; int netmap_no_pendintr = 1; int netmap_txsync_retry = 2; int netmap_flags = 0; /* debug flags */ static int netmap_fwd = 0; /* force transparent mode */ /* * netmap_admode selects the netmap mode to use. * Invalid values are reset to NETMAP_ADMODE_BEST */ enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ NETMAP_ADMODE_NATIVE, /* either native or none */ NETMAP_ADMODE_GENERIC, /* force generic */ NETMAP_ADMODE_LAST }; static int netmap_admode = NETMAP_ADMODE_BEST; /* netmap_generic_mit controls mitigation of RX notifications for * the generic netmap adapter. The value is a time interval in * nanoseconds. */ int netmap_generic_mit = 100*1000; /* We use by default netmap-aware qdiscs with generic netmap adapters, * even if there can be a little performance hit with hardware NICs. * However, using the qdisc is the safer approach, for two reasons: * 1) it prevents non-fifo qdiscs to break the TX notification * scheme, which is based on mbuf destructors when txqdisc is * not used. * 2) it makes it possible to transmit over software devices that * change skb->dev, like bridge, veth, ... * * Anyway users looking for the best performance should * use native adapters. */ int netmap_generic_txqdisc = 1; /* Default number of slots and queues for generic adapters. */ int netmap_generic_ringsize = 1024; int netmap_generic_rings = 1; /* Non-zero if ptnet devices are allowed to use virtio-net headers. 
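 * The corresponding sysctl is declared below, so the feature can also be
 * turned off at run time, e.g. with "sysctl dev.netmap.ptnet_vnet_hdr=0".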
*/ int ptnet_vnet_hdr = 1; /* * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated * in some other operating systems */ SYSBEGIN(main_init); SYSCTL_DECL(_dev_netmap); SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW, &netmap_generic_txqdisc, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr, 0 , ""); SYSEND; NMG_LOCK_T netmap_global_lock; /* * mark the ring as stopped, and run through the locks * to make sure other users get to see it. * stopped must be either NR_KR_STOPPED (for unbounded stop) * of NR_KR_LOCKED (brief stop for mutual exclusion purposes) */ static void netmap_disable_ring(struct netmap_kring *kr, int stopped) { nm_kr_stop(kr, stopped); // XXX check if nm_kr_stop is sufficient mtx_lock(&kr->q_lock); mtx_unlock(&kr->q_lock); nm_kr_put(kr); } /* stop or enable a single ring */ void netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped) { if (stopped) netmap_disable_ring(NMR(na, t) + ring_id, stopped); else NMR(na, t)[ring_id].nkr_stopped = 0; } /* stop or enable all the rings of na */ void netmap_set_all_rings(struct netmap_adapter *na, int stopped) { int i; enum txrx t; if (!nm_netmap_on(na)) return; for_rx_tx(t) { for (i = 0; i < netmap_real_rings(na, t); i++) { netmap_set_ring(na, i, t, stopped); } } } /* * Convenience function used in drivers. Waits for current txsync()s/rxsync()s * to finish and prevents any new one from starting. Call this before turning * netmap mode off, or before removing the hardware rings (e.g., on module * onload). */ void netmap_disable_all_rings(struct ifnet *ifp) { if (NM_NA_VALID(ifp)) { netmap_set_all_rings(NA(ifp), NM_KR_STOPPED); } } /* * Convenience function used in drivers. Re-enables rxsync and txsync on the * adapter's rings In linux drivers, this should be placed near each * napi_enable(). 
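 * A typical driver re-initialization path would bracket the reset with
 * the two calls, e.g. (a hypothetical sketch; FOO_LOCK()/foo_stop()/
 * foo_init_locked() stand for the driver's own primitives):
 *
 *	FOO_LOCK(sc);
 *	netmap_disable_all_rings(sc->ifp);	/* wait for pending *_sync()s */
 *	foo_stop(sc);
 *	foo_init_locked(sc);			/* reprogram the hardware rings */
 *	netmap_enable_all_rings(sc->ifp);
 *	FOO_UNLOCK(sc);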
*/ void netmap_enable_all_rings(struct ifnet *ifp) { if (NM_NA_VALID(ifp)) { netmap_set_all_rings(NA(ifp), 0 /* enabled */); } } void netmap_make_zombie(struct ifnet *ifp) { if (NM_NA_VALID(ifp)) { struct netmap_adapter *na = NA(ifp); netmap_set_all_rings(na, NM_KR_LOCKED); na->na_flags |= NAF_ZOMBIE; netmap_set_all_rings(na, 0); } } void netmap_undo_zombie(struct ifnet *ifp) { if (NM_NA_VALID(ifp)) { struct netmap_adapter *na = NA(ifp); if (na->na_flags & NAF_ZOMBIE) { netmap_set_all_rings(na, NM_KR_LOCKED); na->na_flags &= ~NAF_ZOMBIE; netmap_set_all_rings(na, 0); } } } /* * generic bound_checking function */ u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) { u_int oldv = *v; const char *op = NULL; if (dflt < lo) dflt = lo; if (dflt > hi) dflt = hi; if (oldv < lo) { *v = dflt; op = "Bump"; } else if (oldv > hi) { *v = hi; op = "Clamp"; } if (op && msg) printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); return *v; } /* * packet-dump function, user-supplied or static buffer. * The destination buffer must be at least 30+4*len */ const char * nm_dump_buf(char *p, int len, int lim, char *dst) { static char _dst[8192]; int i, j, i0; static char hex[] ="0123456789abcdef"; char *o; /* output position */ #define P_HI(x) hex[((x) & 0xf0)>>4] #define P_LO(x) hex[((x) & 0xf)] #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') if (!dst) dst = _dst; if (lim <= 0 || lim > len) lim = len; o = dst; sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); o += strlen(o); /* hexdump routine */ for (i = 0; i < lim; ) { sprintf(o, "%5d: ", i); o += strlen(o); memset(o, ' ', 48); i0 = i; for (j=0; j < 16 && i < lim; i++, j++) { o[j*3] = P_HI(p[i]); o[j*3+1] = P_LO(p[i]); } i = i0; for (j=0; j < 16 && i < lim; i++, j++) o[j + 48] = P_C(p[i]); o[j+48] = '\n'; o += j+49; } *o = '\0'; #undef P_HI #undef P_LO #undef P_C return dst; } /* * Fetch configuration from the device, to cope with dynamic * reconfigurations after loading the module. */ /* call with NMG_LOCK held */ int netmap_update_config(struct netmap_adapter *na) { u_int txr, txd, rxr, rxd; txr = txd = rxr = rxd = 0; if (na->nm_config == NULL || na->nm_config(na, &txr, &txd, &rxr, &rxd)) { /* take whatever we had at init time */ txr = na->num_tx_rings; txd = na->num_tx_desc; rxr = na->num_rx_rings; rxd = na->num_rx_desc; } if (na->num_tx_rings == txr && na->num_tx_desc == txd && na->num_rx_rings == rxr && na->num_rx_desc == rxd) return 0; /* nothing changed */ if (netmap_verbose || na->active_fds > 0) { D("stored config %s: txring %d x %d, rxring %d x %d", na->name, na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc); D("new config %s: txring %d x %d, rxring %d x %d", na->name, txr, txd, rxr, rxd); } if (na->active_fds == 0) { D("configuration changed (but fine)"); na->num_tx_rings = txr; na->num_tx_desc = txd; na->num_rx_rings = rxr; na->num_rx_desc = rxd; return 0; } D("configuration changed while active, this is bad..."); return 1; } /* nm_sync callbacks for the host rings */ static int netmap_txsync_to_host(struct netmap_kring *kring, int flags); static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags); /* create the krings array and initialize the fields common to all adapters. 
* The array layout is this: * * +----------+ * na->tx_rings ----->| | \ * | | } na->num_tx_ring * | | / * +----------+ * | | host tx kring * na->rx_rings ----> +----------+ * | | \ * | | } na->num_rx_rings * | | / * +----------+ * | | host rx kring * +----------+ * na->tailroom ----->| | \ * | | } tailroom bytes * | | / * +----------+ * * Note: for compatibility, host krings are created even when not needed. * The tailroom space is currently used by vale ports for allocating leases. */ /* call with NMG_LOCK held */ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom) { u_int i, len, ndesc; struct netmap_kring *kring; u_int n[NR_TXRX]; enum txrx t; /* account for the (possibly fake) host rings */ n[NR_TX] = na->num_tx_rings + 1; n[NR_RX] = na->num_rx_rings + 1; len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom; na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); if (na->tx_rings == NULL) { D("Cannot allocate krings"); return ENOMEM; } na->rx_rings = na->tx_rings + n[NR_TX]; /* * All fields in krings are 0 except the one initialized below. * but better be explicit on important kring fields. */ for_rx_tx(t) { ndesc = nma_get_ndesc(na, t); for (i = 0; i < n[t]; i++) { kring = &NMR(na, t)[i]; bzero(kring, sizeof(*kring)); kring->na = na; kring->ring_id = i; kring->tx = t; kring->nkr_num_slots = ndesc; kring->nr_mode = NKR_NETMAP_OFF; kring->nr_pending_mode = NKR_NETMAP_OFF; if (i < nma_get_nrings(na, t)) { kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync); } else { kring->nm_sync = (t == NR_TX ? netmap_txsync_to_host: netmap_rxsync_from_host); } kring->nm_notify = na->nm_notify; kring->rhead = kring->rcur = kring->nr_hwcur = 0; /* * IMPORTANT: Always keep one slot empty. */ kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0); snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name, nm_txrx2str(t), i); ND("ktx %s h %d c %d t %d", kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF); nm_os_selinfo_init(&kring->si); } nm_os_selinfo_init(&na->si[t]); } na->tailroom = na->rx_rings + n[NR_RX]; return 0; } /* undo the actions performed by netmap_krings_create */ /* call with NMG_LOCK held */ void netmap_krings_delete(struct netmap_adapter *na) { struct netmap_kring *kring = na->tx_rings; enum txrx t; for_rx_tx(t) nm_os_selinfo_uninit(&na->si[t]); /* we rely on the krings layout described above */ for ( ; kring != na->tailroom; kring++) { mtx_destroy(&kring->q_lock); nm_os_selinfo_uninit(&kring->si); } free(na->tx_rings, M_DEVBUF); na->tx_rings = na->rx_rings = na->tailroom = NULL; } /* * Destructor for NIC ports. They also have an mbuf queue * on the rings connected to the host so we need to purge * them first. */ /* call with NMG_LOCK held */ void netmap_hw_krings_delete(struct netmap_adapter *na) { struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; ND("destroy sw mbq with len %d", mbq_len(q)); mbq_purge(q); mbq_safe_fini(q); netmap_krings_delete(na); } /* * Undo everything that was done in netmap_do_regif(). In particular, * call nm_register(ifp,0) to stop netmap mode on the interface and * revert to normal operation. 
*/ /* call with NMG_LOCK held */ static void netmap_unset_ringid(struct netmap_priv_d *); static void netmap_krings_put(struct netmap_priv_d *); void netmap_do_unregif(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; NMG_LOCK_ASSERT(); na->active_fds--; /* unset nr_pending_mode and possibly release exclusive mode */ netmap_krings_put(priv); #ifdef WITH_MONITOR /* XXX check whether we have to do something with monitor * when rings change nr_mode. */ if (na->active_fds <= 0) { /* walk through all the rings and tell any monitor * that the port is going to exit netmap mode */ netmap_monitor_stop(na); } #endif if (na->active_fds <= 0 || nm_kring_pending(priv)) { na->nm_register(na, 0); } /* delete rings and buffers that are no longer needed */ netmap_mem_rings_delete(na); if (na->active_fds <= 0) { /* last instance */ /* * (TO CHECK) We enter here * when the last reference to this file descriptor goes * away. This means we cannot have any pending poll() * or interrupt routine operating on the structure. * XXX The file may be closed in a thread while * another thread is using it. * Linux keeps the file opened until the last reference * by any outstanding ioctl/poll or mmap is gone. * FreeBSD does not track mmap()s (but we do) and * wakes up any sleeping poll(). Need to check what * happens if the close() occurs while a concurrent * syscall is running. */ if (netmap_verbose) D("deleting last instance for %s", na->name); if (nm_netmap_on(na)) { D("BUG: netmap on while going to delete the krings"); } na->nm_krings_delete(na); } /* possibily decrement counter of tx_si/rx_si users */ netmap_unset_ringid(priv); /* delete the nifp */ netmap_mem_if_delete(na, priv->np_nifp); /* drop the allocator */ netmap_mem_deref(na->nm_mem, na); /* mark the priv as unregistered */ priv->np_na = NULL; priv->np_nifp = NULL; } /* call with NMG_LOCK held */ static __inline int nm_si_user(struct netmap_priv_d *priv, enum txrx t) { return (priv->np_na != NULL && (priv->np_qlast[t] - priv->np_qfirst[t] > 1)); } struct netmap_priv_d* netmap_priv_new(void) { struct netmap_priv_d *priv; priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, M_NOWAIT | M_ZERO); if (priv == NULL) return NULL; priv->np_refs = 1; nm_os_get_module(); return priv; } /* * Destructor of the netmap_priv_d, called when the fd is closed * Action: undo all the things done by NIOCREGIF, * On FreeBSD we need to track whether there are active mmap()s, * and we use np_active_mmaps for that. On linux, the field is always 0. * Return: 1 if we can free priv, 0 otherwise. * */ /* call with NMG_LOCK held */ void netmap_priv_delete(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; /* number of active references to this fd */ if (--priv->np_refs > 0) { return; } nm_os_put_module(); if (na) { netmap_do_unregif(priv); } netmap_unget_na(na, priv->np_ifp); bzero(priv, sizeof(*priv)); /* for safety */ free(priv, M_DEVBUF); } /* call with NMG_LOCK *not* held */ void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; NMG_LOCK(); netmap_priv_delete(priv); NMG_UNLOCK(); } /* * Handlers for synchronization of the queues from/to the host. * Netmap has two operating modes: * - in the default mode, the rings connected to the host stack are * just another ring pair managed by userspace; * - in transparent mode (XXX to be defined) incoming packets * (from the host or the NIC) are marked as NS_FORWARD upon * arrival, and the user application has a chance to reset the * flag for packets that should be dropped. 
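 * For example, an application using transparent mode could scan the
 * host RX ring and clear the flag on the packets it wants to discard
 * (an illustrative sketch; want_to_keep() is a hypothetical predicate
 * and the ring helpers come from net/netmap_user.h):
 *
 *	uint32_t i;
 *
 *	for (i = ring->head; i != ring->tail; i = nm_ring_next(ring, i)) {
 *		struct netmap_slot *slot = &ring->slot[i];
 *		char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *
 *		if (!want_to_keep(buf, slot->len))
 *			slot->flags &= ~NS_FORWARD;	/* drop this packet */
 *	}
 *	ring->head = ring->cur = i;	/* release the slots to the kernel */
 *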
* On the RXSYNC or poll(), packets in RX rings between * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved * to the other side. * The transfer NIC --> host is relatively easy, just encapsulate * into mbufs and we are done. The host --> NIC side is slightly * harder because there might not be room in the tx ring so it * might take a while before releasing the buffer. */ /* * pass a chain of buffers to the host stack as coming from 'dst' * We do not need to lock because the queue is private. */ static void netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; struct mbuf *head = NULL, *prev = NULL; /* send packets up, outside the lock */ while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); prev = nm_os_send_up(dst, m, prev); if (head == NULL) head = prev; } if (head) nm_os_send_up(dst, NULL, head); mbq_fini(q); } /* * put a copy of the buffers marked NS_FORWARD into an mbuf chain. * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) * and pass them up. Drop remaining packets in the unlikely event * of an mbuf shortage. */ static void netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) { u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; u_int n; struct netmap_adapter *na = kring->na; for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { struct mbuf *m; struct netmap_slot *slot = &kring->ring->slot[n]; if ((slot->flags & NS_FORWARD) == 0 && !force) continue; if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) { RD(5, "bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? /* XXX TODO: adapt to the case of a multisegment packet */ m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) break; mbq_enqueue(q, m); } } static inline int _nm_may_forward(struct netmap_kring *kring) { return ((netmap_fwd || kring->ring->flags & NR_FORWARD) && kring->na->na_flags & NAF_HOST_RINGS && kring->tx == NR_RX); } static inline int nm_may_forward_up(struct netmap_kring *kring) { return _nm_may_forward(kring) && kring->ring_id != kring->na->num_rx_rings; } static inline int nm_may_forward_down(struct netmap_kring *kring) { return _nm_may_forward(kring) && kring->ring_id == kring->na->num_rx_rings; } /* * Send to the NIC rings packets marked NS_FORWARD between * kring->nr_hwcur and kring->rhead * Called under kring->rx_queue.lock on the sw rx ring, */ static u_int netmap_sw_to_nic(struct netmap_adapter *na) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_slot *rxslot = kring->ring->slot; u_int i, rxcur = kring->nr_hwcur; u_int const head = kring->rhead; u_int const src_lim = kring->nkr_num_slots - 1; u_int sent = 0; /* scan rings to find space, then fill as much as possible */ for (i = 0; i < na->num_tx_rings; i++) { struct netmap_kring *kdst = &na->tx_rings[i]; struct netmap_ring *rdst = kdst->ring; u_int const dst_lim = kdst->nkr_num_slots - 1; /* XXX do we trust ring or kring->rcur,rtail ? 
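 * The loop below forwards without copying data: the buffer indices of
 * the source (host RX) slot and the destination (NIC TX) slot are
 * swapped, and both slots are flagged NS_BUF_CHANGED so that their new
 * owners reload the buffer addresses on the next sync.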
*/ for (; rxcur != head && !nm_ring_empty(rdst); rxcur = nm_next(rxcur, src_lim) ) { struct netmap_slot *src, *dst, tmp; u_int dst_head = rdst->head; src = &rxslot[rxcur]; if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) continue; sent++; dst = &rdst->slot[dst_head]; tmp = *src; src->buf_idx = dst->buf_idx; src->flags = NS_BUF_CHANGED; dst->buf_idx = tmp.buf_idx; dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; rdst->head = rdst->cur = nm_next(dst_head, dst_lim); } /* if (sent) XXX txsync ? */ } return sent; } /* * netmap_txsync_to_host() passes packets up. We are called from a * system call in user process context, and the only contention * can be among multiple user threads erroneously calling * this routine concurrently. */ static int netmap_txsync_to_host(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; struct mbq q; /* Take packets from hwcur to head and pass them up. * force head = cur since netmap_grab_packets() stops at head * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. */ mbq_init(&q); netmap_grab_packets(kring, &q, 1 /* force */); ND("have %d pkts in queue", mbq_len(&q)); kring->nr_hwcur = head; kring->nr_hwtail = head + lim; if (kring->nr_hwtail > lim) kring->nr_hwtail -= lim + 1; netmap_send_up(na->ifp, &q); return 0; } /* * rxsync backend for packets coming from the host stack. * They have been put in kring->rx_queue by netmap_transmit(). * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler * (we know because sr != NULL). * * returns the number of packets delivered to tx queues in * transparent mode, or a negative value if error */ static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; u_int nm_i, n; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; int ret = 0; struct mbq *q = &kring->rx_queue, fq; mbq_init(&fq); /* fq holds packets to be freed */ mbq_lock(q); /* First part: import newly received packets */ n = mbq_len(q); if (n) { /* grab packets from the queue */ struct mbuf *m; uint32_t stop_i; nm_i = kring->nr_hwtail; stop_i = nm_prev(nm_i, lim); while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { int len = MBUF_LEN(m); struct netmap_slot *slot = &ring->slot[nm_i]; m_copydata(m, 0, len, NMB(na, slot)); ND("nm %d len %d", nm_i, len); if (netmap_verbose) D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL)); slot->len = len; slot->flags = kring->nkr_slot_flags; nm_i = nm_next(nm_i, lim); mbq_enqueue(&fq, m); } kring->nr_hwtail = nm_i; } /* * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* something was released */ if (nm_may_forward_down(kring)) { ret = netmap_sw_to_nic(na); if (ret > 0) { kring->nr_kflags |= NR_FORWARD; ret = 0; } } kring->nr_hwcur = head; } mbq_unlock(q); mbq_purge(&fq); mbq_fini(&fq); return ret; } /* Get a netmap adapter for the port. * * If it is possible to satisfy the request, return 0 * with *na containing the netmap adapter found. * Otherwise return an error code, with *na containing NULL. * * When the port is attached to a bridge, we always return * EBUSY. * Otherwise, if the port is already bound to a file descriptor, * then we unconditionally return the existing adapter into *na. 
* In all the other cases, we return (into *na) either native, * generic or NULL, according to the following table: * * native_support * active_fds dev.netmap.admode YES NO * ------------------------------------------------------- * >0 * NA(ifp) NA(ifp) * * 0 NETMAP_ADMODE_BEST NATIVE GENERIC * 0 NETMAP_ADMODE_NATIVE NATIVE NULL * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC * */ static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */ int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) { /* generic support */ int i = netmap_admode; /* Take a snapshot. */ struct netmap_adapter *prev_na; int error = 0; *na = NULL; /* default */ /* reset in case of invalid value */ if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) i = netmap_admode = NETMAP_ADMODE_BEST; if (NM_NA_VALID(ifp)) { prev_na = NA(ifp); /* If an adapter already exists, return it if * there are active file descriptors or if * netmap is not forced to use generic * adapters. */ if (NETMAP_OWNED_BY_ANY(prev_na) || i != NETMAP_ADMODE_GENERIC || prev_na->na_flags & NAF_FORCE_NATIVE #ifdef WITH_PIPES /* ugly, but we cannot allow an adapter switch * if some pipe is referring to this one */ || prev_na->na_next_pipe > 0 #endif ) { *na = prev_na; return 0; } } /* If there isn't native support and netmap is not allowed * to use generic adapters, we cannot satisfy the request. */ if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE) return EOPNOTSUPP; /* Otherwise, create a generic adapter and return it, * saving the previously used netmap adapter, if any. * * Note that here 'prev_na', if not NULL, MUST be a * native adapter, and CANNOT be a generic one. This is * true because generic adapters are created on demand, and * destroyed when not used anymore. Therefore, if the adapter * currently attached to an interface 'ifp' is generic, it * must be that * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). * Consequently, if NA(ifp) is generic, we will enter one of * the branches above. This ensures that we never override * a generic adapter with another generic adapter. */ error = generic_netmap_attach(ifp); if (error) return error; *na = NA(ifp); return 0; } /* * MUST BE CALLED UNDER NMG_LOCK() * * Get a refcounted reference to a netmap adapter attached * to the interface specified by nmr. * This is always called in the execution of an ioctl(). * * Return ENXIO if the interface specified by the request does * not exist, ENOTSUP if netmap is not supported by the interface, * EBUSY if the interface is already attached to a bridge, * EINVAL if parameters are invalid, ENOMEM if needed resources * could not be allocated. * If successful, hold a reference to the netmap adapter. * * If the interface specified by nmr is a system one, also keep * a reference to it and return a valid *ifp. */ int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, struct ifnet **ifp, int create) { int error = 0; struct netmap_adapter *ret = NULL; *na = NULL; /* default return value */ *ifp = NULL; NMG_LOCK_ASSERT(); /* We cascade through all possible types of netmap adapter. 
* All netmap_get_*_na() functions return an error and an na, * with the following combinations: * * error na * 0 NULL type doesn't match * !0 NULL type matches, but na creation/lookup failed * 0 !NULL type matches and na created/found * !0 !NULL impossible */ /* try to see if this is a ptnetmap port */ error = netmap_get_pt_host_na(nmr, na, create); if (error || *na != NULL) return error; /* try to see if this is a monitor port */ error = netmap_get_monitor_na(nmr, na, create); if (error || *na != NULL) return error; /* try to see if this is a pipe port */ error = netmap_get_pipe_na(nmr, na, create); if (error || *na != NULL) return error; /* try to see if this is a bridge port */ error = netmap_get_bdg_na(nmr, na, create); if (error) return error; if (*na != NULL) /* valid match in netmap_get_bdg_na() */ goto out; /* * This must be a hardware na, lookup the name in the system. * Note that by hardware we actually mean "it shows up in ifconfig". * This may still be a tap, a veth/epair, or even a * persistent VALE port. */ *ifp = ifunit_ref(nmr->nr_name); if (*ifp == NULL) { return ENXIO; } error = netmap_get_hw_na(*ifp, &ret); if (error) goto out; *na = ret; netmap_adapter_get(ret); out: if (error) { if (ret) netmap_adapter_put(ret); if (*ifp) { if_rele(*ifp); *ifp = NULL; } } return error; } /* undo netmap_get_na() */ void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp) { if (ifp) if_rele(ifp); if (na) netmap_adapter_put(na); } #define NM_FAIL_ON(t) do { \ if (unlikely(t)) { \ RD(5, "%s: fail '" #t "' " \ "h %d c %d t %d " \ "rh %d rc %d rt %d " \ "hc %d ht %d", \ kring->name, \ head, cur, ring->tail, \ kring->rhead, kring->rcur, kring->rtail, \ kring->nr_hwcur, kring->nr_hwtail); \ return kring->nkr_num_slots; \ } \ } while (0) /* * validate parameters on entry for *_txsync() * Returns ring->cur if ok, or something >= kring->nkr_num_slots * in case of error. * * rhead, rcur and rtail=hwtail are stored from previous round. * hwcur is the next packet to send to the ring. * * We want * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail * * hwcur, rhead, rtail and hwtail are reliable */ u_int nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { u_int head = ring->head; /* read only once */ u_int cur = ring->cur; /* read only once */ u_int n = kring->nkr_num_slots; ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", kring->name, kring->nr_hwcur, kring->nr_hwtail, ring->head, ring->cur, ring->tail); #if 1 /* kernel sanity checks; but we can trust the kring. */ NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n || kring->rtail >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* * user sanity checks. We only use head, * A, B, ... are possible positions for head: * * 0 A rhead B rtail C n-1 * 0 D rtail E rhead F n-1 * * B, F, D are valid. A, C, E are wrong */ if (kring->rtail >= kring->rhead) { /* want rhead <= head <= rtail */ NM_FAIL_ON(head < kring->rhead || head > kring->rtail); /* and also head <= cur <= rtail */ NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* here rtail < rhead */ /* we need head outside rtail .. 
rhead */ NM_FAIL_ON(head > kring->rtail && head < kring->rhead); /* two cases now: head <= rtail or head >= rhead */ if (head <= kring->rtail) { /* want head <= cur <= rtail */ NM_FAIL_ON(cur < head || cur > kring->rtail); } else { /* head >= rhead */ /* cur must be outside rtail..head */ NM_FAIL_ON(cur > kring->rtail && cur < head); } } if (ring->tail != kring->rtail) { RD(5, "%s tail overwritten was %d need %d", kring->name, ring->tail, kring->rtail); ring->tail = kring->rtail; } kring->rhead = head; kring->rcur = cur; return head; } /* * validate parameters on entry for *_rxsync() * Returns ring->head if ok, kring->nkr_num_slots on error. * * For a valid configuration, * hwcur <= head <= cur <= tail <= hwtail * * We only consider head and cur. * hwcur and hwtail are reliable. * */ u_int nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring) { uint32_t const n = kring->nkr_num_slots; uint32_t head, cur; ND(5,"%s kc %d kt %d h %d c %d t %d", kring->name, kring->nr_hwcur, kring->nr_hwtail, ring->head, ring->cur, ring->tail); /* * Before storing the new values, we should check they do not * move backwards. However: * - head is not an issue because the previous value is hwcur; * - cur could in principle go back, however it does not matter * because we are processing a brand new rxsync() */ cur = kring->rcur = ring->cur; /* read only once */ head = kring->rhead = ring->head; /* read only once */ #if 1 /* kernel sanity checks */ NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n); #endif /* kernel sanity checks */ /* user sanity checks */ if (kring->nr_hwtail >= kring->nr_hwcur) { /* want hwcur <= rhead <= hwtail */ NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail); /* and also rhead <= rcur <= hwtail */ NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* we need rhead outside hwtail..hwcur */ NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail); /* two cases now: head <= hwtail or head >= hwcur */ if (head <= kring->nr_hwtail) { /* want head <= cur <= hwtail */ NM_FAIL_ON(cur < head || cur > kring->nr_hwtail); } else { /* cur must be outside hwtail..head */ NM_FAIL_ON(cur < head && cur > kring->nr_hwtail); } } if (ring->tail != kring->rtail) { RD(5, "%s tail overwritten was %d need %d", kring->name, ring->tail, kring->rtail); ring->tail = kring->rtail; } return head; } /* * Error routine called when txsync/rxsync detects an error. * Can't do much more than resetting head =cur = hwcur, tail = hwtail * Return 1 on reinit. * * This routine is only called by the upper half of the kernel. * It only reads hwcur (which is changed only by the upper half, too) * and hwtail (which may be changed by the lower half, but only on * a tx ring and only to increase it, so any error will be recovered * on the next call). For the above, we don't strictly need to call * it under lock. 
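 * For reference, the user-index validation performed by the two
 * prologue routines above amounts to requiring, on the circular ring,
 * head in [rhead, rtail] and cur in [head, rtail] for txsync
 * (respectively head in [hwcur, hwtail] and cur in [head, hwtail] for
 * rxsync); a wrap-aware membership test equivalent to those
 * NM_FAIL_ON() checks would look like (a sketch, not a helper defined
 * in this file):
 *
 *	static int
 *	nm_in_circular_interval(u_int lo, u_int val, u_int hi)
 *	{
 *		return (lo <= hi) ? (lo <= val && val <= hi)
 *				  : (val >= lo || val <= hi);
 *	}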
*/ int netmap_ring_reinit(struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; u_int i, lim = kring->nkr_num_slots - 1; int errors = 0; // XXX KASSERT nm_kr_tryget RD(10, "called for %s", kring->name); // XXX probably wrong to trust userspace kring->rhead = ring->head; kring->rcur = ring->cur; kring->rtail = ring->tail; if (ring->cur > lim) errors++; if (ring->head > lim) errors++; if (ring->tail > lim) errors++; for (i = 0; i <= lim; i++) { u_int idx = ring->slot[i].buf_idx; u_int len = ring->slot[i].len; if (idx < 2 || idx >= kring->na->na_lut.objtotal) { RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); ring->slot[i].buf_idx = 0; ring->slot[i].len = 0; } else if (len > NETMAP_BUF_SIZE(kring->na)) { ring->slot[i].len = 0; RD(5, "bad len at slot %d idx %d len %d", i, idx, len); } } if (errors) { RD(10, "total %d errors", errors); RD(10, "%s reinit, cur %d -> %d tail %d -> %d", kring->name, ring->cur, kring->nr_hwcur, ring->tail, kring->nr_hwtail); ring->head = kring->rhead = kring->nr_hwcur; ring->cur = kring->rcur = kring->nr_hwcur; ring->tail = kring->rtail = kring->nr_hwtail; } return (errors ? 1 : 0); } /* interpret the ringid and flags fields of an nmreq, by translating them * into a pair of intervals of ring indices: * * [priv->np_txqfirst, priv->np_txqlast) and * [priv->np_rxqfirst, priv->np_rxqlast) * */ int netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) { struct netmap_adapter *na = priv->np_na; u_int j, i = ringid & NETMAP_RING_MASK; u_int reg = flags & NR_REG_MASK; int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY }; enum txrx t; if (reg == NR_REG_DEFAULT) { /* convert from old ringid to flags */ if (ringid & NETMAP_SW_RING) { reg = NR_REG_SW; } else if (ringid & NETMAP_HW_RING) { reg = NR_REG_ONE_NIC; } else { reg = NR_REG_ALL_NIC; } D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); } if ((flags & NR_PTNETMAP_HOST) && (reg != NR_REG_ALL_NIC || flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) { D("Error: only NR_REG_ALL_NIC supported with netmap passthrough"); return EINVAL; } for_rx_tx(t) { if (flags & excluded_direction[t]) { priv->np_qfirst[t] = priv->np_qlast[t] = 0; continue; } switch (reg) { case NR_REG_ALL_NIC: case NR_REG_PIPE_MASTER: case NR_REG_PIPE_SLAVE: priv->np_qfirst[t] = 0; priv->np_qlast[t] = nma_get_nrings(na, t); ND("ALL/PIPE: %s %d %d", nm_txrx2str(t), priv->np_qfirst[t], priv->np_qlast[t]); break; case NR_REG_SW: case NR_REG_NIC_SW: if (!(na->na_flags & NAF_HOST_RINGS)) { D("host rings not supported"); return EINVAL; } priv->np_qfirst[t] = (reg == NR_REG_SW ? nma_get_nrings(na, t) : 0); priv->np_qlast[t] = nma_get_nrings(na, t) + 1; ND("%s: %s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", nm_txrx2str(t), priv->np_qfirst[t], priv->np_qlast[t]); break; case NR_REG_ONE_NIC: if (i >= na->num_tx_rings && i >= na->num_rx_rings) { D("invalid ring id %d", i); return EINVAL; } /* if not enough rings, use the first one */ j = i; if (j >= nma_get_nrings(na, t)) j = 0; priv->np_qfirst[t] = j; priv->np_qlast[t] = j + 1; ND("ONE_NIC: %s %d %d", nm_txrx2str(t), priv->np_qfirst[t], priv->np_qlast[t]); break; default: D("invalid regif type %d", reg); return EINVAL; } } priv->np_flags = (flags & ~NR_REG_MASK) | reg; if (netmap_verbose) { D("%s: tx [%d,%d) rx [%d,%d) id %d", na->name, priv->np_qfirst[NR_TX], priv->np_qlast[NR_TX], priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX], i); } return 0; } /* * Set the ring ID. 
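 * As an example (illustrative numbers), on an adapter with 4 tx and 4 rx
 * hardware rings plus the host rings, the requests are resolved by
 * netmap_interp_ringid() above as:
 *
 *	NR_REG_ALL_NIC			tx [0,4)   rx [0,4)
 *	NR_REG_ONE_NIC (ring 2)		tx [2,3)   rx [2,3)
 *	NR_REG_SW			tx [4,5)   rx [4,5)   (host rings only)
 *	NR_REG_NIC_SW			tx [0,5)   rx [0,5)
 *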
For devices with a single queue, a request * for all rings is the same as a single ring. */ static int netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) { struct netmap_adapter *na = priv->np_na; int error; enum txrx t; error = netmap_interp_ringid(priv, ringid, flags); if (error) { return error; } priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; /* optimization: count the users registered for more than * one ring, which are the ones sleeping on the global queue. * The default netmap_notify() callback will then * avoid signaling the global queue if nobody is using it */ for_rx_tx(t) { if (nm_si_user(priv, t)) na->si_users[t]++; } return 0; } static void netmap_unset_ringid(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; enum txrx t; for_rx_tx(t) { if (nm_si_user(priv, t)) na->si_users[t]--; priv->np_qfirst[t] = priv->np_qlast[t] = 0; } priv->np_flags = 0; priv->np_txpoll = 0; } /* Set the nr_pending_mode for the requested rings. * If requested, also try to get exclusive access to the rings, provided * the rings we want to bind are not exclusively owned by a previous bind. */ static int netmap_krings_get(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; struct netmap_kring *kring; int excl = (priv->np_flags & NR_EXCLUSIVE); enum txrx t; ND("%s: grabbing tx [%d, %d) rx [%d, %d)", na->name, priv->np_qfirst[NR_TX], priv->np_qlast[NR_TX], priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]); /* first round: check that all the requested rings * are neither alread exclusively owned, nor we * want exclusive ownership when they are already in use */ for_rx_tx(t) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; if ((kring->nr_kflags & NKR_EXCLUSIVE) || (kring->users && excl)) { ND("ring %s busy", kring->name); return EBUSY; } } } /* second round: increment usage count (possibly marking them * as exclusive) and set the nr_pending_mode */ for_rx_tx(t) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; kring->users++; if (excl) kring->nr_kflags |= NKR_EXCLUSIVE; kring->nr_pending_mode = NKR_NETMAP_ON; } } return 0; } /* Undo netmap_krings_get(). This is done by clearing the exclusive mode * if was asked on regif, and unset the nr_pending_mode if we are the * last users of the involved rings. */ static void netmap_krings_put(struct netmap_priv_d *priv) { struct netmap_adapter *na = priv->np_na; u_int i; struct netmap_kring *kring; int excl = (priv->np_flags & NR_EXCLUSIVE); enum txrx t; ND("%s: releasing tx [%d, %d) rx [%d, %d)", na->name, priv->np_qfirst[NR_TX], priv->np_qlast[NR_TX], priv->np_qfirst[NR_RX], priv->np_qlast[MR_RX]); for_rx_tx(t) { for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; if (excl) kring->nr_kflags &= ~NKR_EXCLUSIVE; kring->users--; if (kring->users == 0) kring->nr_pending_mode = NKR_NETMAP_OFF; } } } /* * possibly move the interface to netmap-mode. * If success it returns a pointer to netmap_if, otherwise NULL. * This must be called with NMG_LOCK held. * * The following na callbacks are called in the process: * * na->nm_config() [by netmap_update_config] * (get current number and size of rings) * * We have a generic one for linux (netmap_linux_config). * The bwrap has to override this, since it has to forward * the request to the wrapped adapter (netmap_bwrap_config). 
* * * na->nm_krings_create() * (create and init the krings array) * * One of the following: * * * netmap_hw_krings_create, (hw ports) * creates the standard layout for the krings * and adds the mbq (used for the host rings). * * * netmap_vp_krings_create (VALE ports) * add leases and scratchpads * * * netmap_pipe_krings_create (pipes) * create the krings and rings of both ends and * cross-link them * * * netmap_monitor_krings_create (monitors) * avoid allocating the mbq * * * netmap_bwrap_krings_create (bwraps) * create both the brap krings array, * the krings array of the wrapped adapter, and * (if needed) the fake array for the host adapter * * na->nm_register(, 1) * (put the adapter in netmap mode) * * This may be one of the following: * * * netmap_hw_reg (hw ports) * checks that the ifp is still there, then calls * the hardware specific callback; * * * netmap_vp_reg (VALE ports) * If the port is connected to a bridge, * set the NAF_NETMAP_ON flag under the * bridge write lock. * * * netmap_pipe_reg (pipes) * inform the other pipe end that it is no * longer responsible for the lifetime of this * pipe end * * * netmap_monitor_reg (monitors) * intercept the sync callbacks of the monitored * rings * * * netmap_bwrap_reg (bwraps) * cross-link the bwrap and hwna rings, * forward the request to the hwna, override * the hwna notify callback (to get the frames * coming from outside go through the bridge). * * */ int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint16_t ringid, uint32_t flags) { struct netmap_if *nifp = NULL; int error; NMG_LOCK_ASSERT(); /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); priv->np_na = na; /* store the reference */ error = netmap_set_ringid(priv, ringid, flags); if (error) goto err; error = netmap_mem_finalize(na->nm_mem, na); if (error) goto err; if (na->active_fds == 0) { /* * If this is the first registration of the adapter, * create the in-kernel view of the netmap rings, * the netmap krings. */ /* * Depending on the adapter, this may also create * the netmap rings themselves */ error = na->nm_krings_create(na); if (error) goto err_drop_mem; } /* now the krings must exist and we can check whether some * previous bind has exclusive ownership on them, and set * nr_pending_mode */ error = netmap_krings_get(priv); if (error) goto err_del_krings; /* create all needed missing netmap rings */ error = netmap_mem_rings_create(na); if (error) goto err_rel_excl; /* in all cases, create a new netmap if */ nifp = netmap_mem_if_new(na); if (nifp == NULL) { error = ENOMEM; goto err_del_rings; } if (na->active_fds == 0) { /* cache the allocator info in the na */ error = netmap_mem_get_lut(na->nm_mem, &na->na_lut); if (error) goto err_del_if; ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal, na->na_lut.objsize); } if (nm_kring_pending(priv)) { /* Some kring is switching mode, tell the adapter to * react on this. */ error = na->nm_register(na, 1); if (error) goto err_put_lut; } /* Commit the reference. */ na->active_fds++; /* * advertise that the interface is ready by setting np_nifp. 
* The barrier is needed because readers (poll, *SYNC and mmap) * check for priv->np_nifp != NULL without locking */ mb(); /* make sure previous writes are visible to all CPUs */ priv->np_nifp = nifp; return 0; err_put_lut: if (na->active_fds == 0) memset(&na->na_lut, 0, sizeof(na->na_lut)); err_del_if: netmap_mem_if_delete(na, nifp); err_rel_excl: netmap_krings_put(priv); err_del_rings: netmap_mem_rings_delete(na); err_del_krings: if (na->active_fds == 0) na->nm_krings_delete(na); err_drop_mem: netmap_mem_deref(na->nm_mem, na); err: priv->np_na = NULL; return error; } /* * update kring and ring at the end of rxsync/txsync. */ static inline void nm_sync_finalize(struct netmap_kring *kring) { /* * Update ring tail to what the kernel knows * After txsync: head/rhead/hwcur might be behind cur/rcur * if no carrier. */ kring->ring->tail = kring->rtail = kring->nr_hwtail; ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", kring->name, kring->nr_hwcur, kring->nr_hwtail, kring->rhead, kring->rcur, kring->rtail); } /* * ioctl(2) support for the "netmap" device. * * Following a list of accepted commands: * - NIOCGINFO * - SIOCGIFADDR just for convenience * - NIOCREGIF * - NIOCTXSYNC * - NIOCRXSYNC * * Return 0 on success, errno otherwise. */ int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td) { struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; struct ifnet *ifp = NULL; int error = 0; u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; enum txrx t; if (cmd == NIOCGINFO || cmd == NIOCREGIF) { /* truncate name */ nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; if (nmr->nr_version != NETMAP_API) { D("API mismatch for %s got %d need %d", nmr->nr_name, nmr->nr_version, NETMAP_API); nmr->nr_version = NETMAP_API; } if (nmr->nr_version < NETMAP_MIN_API || nmr->nr_version > NETMAP_MAX_API) { return EINVAL; } } switch (cmd) { case NIOCGINFO: /* return capabilities etc */ if (nmr->nr_cmd == NETMAP_BDG_LIST) { error = netmap_bdg_ctl(nmr, NULL); break; } NMG_LOCK(); do { /* memsize is always valid */ struct netmap_mem_d *nmd = &nm_mem; u_int memflags; if (nmr->nr_name[0] != '\0') { /* get a refcount */ error = netmap_get_na(nmr, &na, &ifp, 1 /* create */); if (error) { na = NULL; ifp = NULL; break; } nmd = na->nm_mem; /* get memory allocator */ } error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, &nmr->nr_arg2); if (error) break; if (na == NULL) /* only memory info */ break; nmr->nr_offset = 0; nmr->nr_rx_slots = nmr->nr_tx_slots = 0; netmap_update_config(na); nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; } while (0); netmap_unget_na(na, ifp); NMG_UNLOCK(); break; case NIOCREGIF: - /* possibly attach/detach NIC and VALE switch */ + /* + * If nmr->nr_cmd is not zero, this NIOCREGIF is not really + * a regif operation, but a different one, specified by the + * value of nmr->nr_cmd. 
+ */ i = nmr->nr_cmd; if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH || i == NETMAP_BDG_VNET_HDR || i == NETMAP_BDG_NEWIF || i == NETMAP_BDG_DELIF || i == NETMAP_BDG_POLLING_ON || i == NETMAP_BDG_POLLING_OFF) { + /* possibly attach/detach NIC and VALE switch */ error = netmap_bdg_ctl(nmr, NULL); break; } else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) { + /* forward the command to the ptnetmap subsystem */ error = ptnetmap_ctl(nmr, priv->np_na); break; } else if (i == NETMAP_VNET_HDR_GET) { + /* get vnet-header length for this netmap port */ struct ifnet *ifp; NMG_LOCK(); error = netmap_get_na(nmr, &na, &ifp, 0); if (na && !error) { nmr->nr_arg1 = na->virt_hdr_len; } netmap_unget_na(na, ifp); NMG_UNLOCK(); break; + } else if (i == NETMAP_POOLS_INFO_GET) { + /* get information from the memory allocator */ + error = netmap_mem_pools_info_get(nmr, priv->np_na); + break; } else if (i != 0) { D("nr_cmd must be 0 not %d", i); error = EINVAL; break; } /* protect access to priv from concurrent NIOCREGIF */ NMG_LOCK(); do { u_int memflags; struct ifnet *ifp; if (priv->np_nifp != NULL) { /* thread already registered */ error = EBUSY; break; } /* find the interface and a reference */ error = netmap_get_na(nmr, &na, &ifp, 1 /* create */); /* keep reference */ if (error) break; if (NETMAP_OWNED_BY_KERN(na)) { netmap_unget_na(na, ifp); error = EBUSY; break; } if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) { netmap_unget_na(na, ifp); error = EIO; break; } error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags); if (error) { /* reg. failed, release priv and ref */ netmap_unget_na(na, ifp); break; } nifp = priv->np_nifp; priv->np_td = td; // XXX kqueue, debugging only /* return the offset of the netmap_if object */ nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, &nmr->nr_arg2); if (error) { netmap_do_unregif(priv); netmap_unget_na(na, ifp); break; } if (memflags & NETMAP_MEM_PRIVATE) { *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; } for_rx_tx(t) { priv->np_si[t] = nm_si_user(priv, t) ? &na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si; } if (nmr->nr_arg3) { if (netmap_verbose) D("requested %d extra buffers", nmr->nr_arg3); nmr->nr_arg3 = netmap_extra_alloc(na, &nifp->ni_bufs_head, nmr->nr_arg3); if (netmap_verbose) D("got %d extra buffers", nmr->nr_arg3); } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); /* store ifp reference so that priv destructor may release it */ priv->np_ifp = ifp; } while (0); NMG_UNLOCK(); break; case NIOCTXSYNC: case NIOCRXSYNC: nifp = priv->np_nifp; if (nifp == NULL) { error = ENXIO; break; } mb(); /* make sure following reads are not from cache */ na = priv->np_na; /* we have a reference */ if (na == NULL) { D("Internal error: nifp != NULL && na == NULL"); error = ENXIO; break; } t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX); krings = NMR(na, t); qfirst = priv->np_qfirst[t]; qlast = priv->np_qlast[t]; for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; struct netmap_ring *ring = kring->ring; if (unlikely(nm_kr_tryget(kring, 1, &error))) { error = (error ? 
EIO : 0); continue; } if (cmd == NIOCTXSYNC) { if (netmap_verbose & NM_VERB_TXSYNC) D("pre txsync ring %d cur %d hwcur %d", i, ring->cur, kring->nr_hwcur); if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) { nm_sync_finalize(kring); } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, ring->cur, kring->nr_hwcur); } else { if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) { nm_sync_finalize(kring); } microtime(&ring->ts); } nm_kr_put(kring); } break; #ifdef WITH_VALE case NIOCCONFIG: error = netmap_bdg_config(nmr); break; #endif #ifdef __FreeBSD__ case FIONBIO: case FIOASYNC: ND("FIONBIO/FIOASYNC are no-ops"); break; case BIOCIMMEDIATE: case BIOCGHDRCMPLT: case BIOCSHDRCMPLT: case BIOCSSEESENT: D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); break; default: /* allow device-specific ioctls */ { struct ifnet *ifp = ifunit_ref(nmr->nr_name); if (ifp == NULL) { error = ENXIO; } else { struct socket so; bzero(&so, sizeof(so)); so.so_vnet = ifp->if_vnet; // so->so_proto not null. error = ifioctl(&so, cmd, data, td); if_rele(ifp); } break; } #else /* linux */ default: error = EOPNOTSUPP; #endif /* linux */ } return (error); } /* * select(2) and poll(2) handlers for the "netmap" device. * * Can be called for one or more queues. * Return true the event mask corresponding to ready events. * If there are no ready events, do a selrecord on either individual * selinfo or on the global one. * Device-dependent parts (locking and sync of tx/rx rings) * are done through callbacks. * * On linux, arguments are really pwait, the poll table, and 'td' is struct file * * The first one is remapped to pwait as selrecord() uses the name as an * hidden argument. */ int netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr) { struct netmap_adapter *na; struct netmap_kring *kring; struct netmap_ring *ring; u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0; #define want_tx want[NR_TX] #define want_rx want[NR_RX] struct mbq q; /* packets from hw queues to host stack */ enum txrx t; /* * In order to avoid nested locks, we need to "double check" * txsync and rxsync if we decide to do a selrecord(). * retry_tx (and retry_rx, later) prevent looping forever. */ int retry_tx = 1, retry_rx = 1; /* transparent mode: send_down is 1 if we have found some * packets to forward during the rx scan and we have not * sent them down to the nic yet */ int send_down = 0; mbq_init(&q); if (priv->np_nifp == NULL) { D("No if registered"); return POLLERR; } mb(); /* make sure following reads are not from cache */ na = priv->np_na; if (!nm_netmap_on(na)) return POLLERR; if (netmap_verbose & 0x8000) D("device %s events 0x%x", na->name, events); want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); /* * check_all_{tx|rx} are set if the card has more than one queue AND * the file descriptor is bound to all of them. If so, we sleep on * the "global" selinfo, otherwise we sleep on individual selinfo * (FreeBSD only allows two selinfo's per file descriptor). * The interrupt routine in the driver wake one or the other * (or both) depending on which clients are active. * * rxsync() is only called if we run out of buffers on a POLLIN. * txsync() is called if we run out of buffers on POLLOUT, or * there are pending packets to send. 
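For reference, the NIOCTXSYNC case handled above is driven from user space roughly as below. A minimal sketch, assuming <net/netmap_user.h>; nmfd, nifp, frame and len are placeholders for a registered netmap descriptor, its netmap_if pointer, and the packet to send.

/* Illustrative sketch: queue one frame on TX ring 0 and flush it. */
struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);

if (!nm_ring_empty(ring)) {		/* at least one free slot */
	u_int i = ring->cur;
	struct netmap_slot *slot = &ring->slot[i];

	nm_pkt_copy(frame, NETMAP_BUF(ring, slot->buf_idx), len);
	slot->len = len;
	ring->head = ring->cur = nm_ring_next(ring, i);
	ioctl(nmfd, NIOCTXSYNC, NULL);	/* runs the txsync loop above */
}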
The latter can be disabled * passing NETMAP_NO_TX_POLL in the NIOCREG call. */ check_all_tx = nm_si_user(priv, NR_TX); check_all_rx = nm_si_user(priv, NR_RX); /* * We start with a lock free round which is cheap if we have * slots available. If this fails, then lock and call the sync * routines. */ #if 1 /* new code- call rx if any of the ring needs to release or read buffers */ if (want_tx) { t = NR_TX; for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; /* XXX compare ring->cur and kring->tail */ if (!nm_ring_empty(kring->ring)) { revents |= want[t]; want[t] = 0; /* also breaks the loop */ } } } if (want_rx) { want_rx = 0; /* look for a reason to run the handlers */ t = NR_RX; for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */ || kring->rhead != kring->ring->head /* release buffers */) { want_rx = 1; } } if (!want_rx) revents |= events & (POLLIN | POLLRDNORM); /* we have data */ } #else /* old code */ for_rx_tx(t) { for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) { kring = &NMR(na, t)[i]; /* XXX compare ring->cur and kring->tail */ if (!nm_ring_empty(kring->ring)) { revents |= want[t]; want[t] = 0; /* also breaks the loop */ } } } #endif /* old code */ /* * If we want to push packets out (priv->np_txpoll) or * want_tx is still set, we must issue txsync calls * (on all rings, to avoid that the tx rings stall). * XXX should also check cur != hwcur on the tx rings. * Fortunately, normal tx mode has np_txpoll set. */ if (priv->np_txpoll || want_tx) { /* * The first round checks if anyone is ready, if not * do a selrecord and another round to handle races. * want_tx goes to 0 if any space is found, and is * used to skip rings with no pending transmissions. */ flush_tx: for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) { int found = 0; kring = &na->tx_rings[i]; ring = kring->ring; if (!send_down && !want_tx && ring->cur == kring->nr_hwcur) continue; if (nm_kr_tryget(kring, 1, &revents)) continue; if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } else { if (kring->nm_sync(kring, 0)) revents |= POLLERR; else nm_sync_finalize(kring); } /* * If we found new slots, notify potential * listeners on the same ring. * Since we just did a txsync, look at the copies * of cur,tail in the kring. */ found = kring->rcur != kring->rtail; nm_kr_put(kring); if (found) { /* notify other listeners */ revents |= want_tx; want_tx = 0; kring->nm_notify(kring, 0); } } /* if there were any packet to forward we must have handled them by now */ send_down = 0; if (want_tx && retry_tx && sr) { nm_os_selrecord(sr, check_all_tx ? &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si); retry_tx = 0; goto flush_tx; } } /* * If want_rx is still set scan receive rings. * Do it on all rings because otherwise we starve. */ if (want_rx) { /* two rounds here for race avoidance */ do_retry_rx: for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) { int found = 0; kring = &na->rx_rings[i]; ring = kring->ring; if (unlikely(nm_kr_tryget(kring, 1, &revents))) continue; if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); revents |= POLLERR; } /* now we can use kring->rcur, rtail */ /* * transparent mode support: collect packets * from the rxring(s). 
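For reference, the poll(2) path implemented by netmap_poll() is consumed from user space roughly as below. A minimal sketch, assuming <poll.h> and <net/netmap_user.h>; nmfd, nifp, nrx_rings and consume() are placeholders.

/* Illustrative sketch: poll()-driven receive loop served by netmap_poll(). */
struct pollfd pfd = { .fd = nmfd, .events = POLLIN };
u_int ri;

for (;;) {
	poll(&pfd, 1, -1);			/* may sleep via selrecord() above */
	for (ri = 0; ri < nrx_rings; ri++) {
		struct netmap_ring *ring = NETMAP_RXRING(nifp, ri);

		while (!nm_ring_empty(ring)) {
			u_int i = ring->cur;
			char *buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);

			consume(buf, ring->slot[i].len);	/* application code */
			ring->head = ring->cur = nm_ring_next(ring, i);
		}
	}
}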
*/ if (nm_may_forward_up(kring)) { ND(10, "forwarding some buffers up %d to %d", kring->nr_hwcur, ring->cur); netmap_grab_packets(kring, &q, netmap_fwd); } kring->nr_kflags &= ~NR_FORWARD; if (kring->nm_sync(kring, 0)) revents |= POLLERR; else nm_sync_finalize(kring); send_down |= (kring->nr_kflags & NR_FORWARD); /* host ring only */ if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) { microtime(&ring->ts); } found = kring->rcur != kring->rtail; nm_kr_put(kring); if (found) { revents |= want_rx; retry_rx = 0; kring->nm_notify(kring, 0); } } if (retry_rx && sr) { nm_os_selrecord(sr, check_all_rx ? &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si); } if (send_down > 0 || retry_rx) { retry_rx = 0; if (send_down) goto flush_tx; /* and retry_rx */ else goto do_retry_rx; } } /* * Transparent mode: marked bufs on rx rings between * kring->nr_hwcur and ring->head * are passed to the other endpoint. * * Transparent mode requires to bind all * rings to a single file descriptor. */ if (q.head && !nm_kr_tryget(&na->tx_rings[na->num_tx_rings], 1, &revents)) { netmap_send_up(na->ifp, &q); nm_kr_put(&na->tx_rings[na->num_tx_rings]); } return (revents); #undef want_tx #undef want_rx } /*-------------------- driver support routines -------------------*/ /* default notify callback */ static int netmap_notify(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; enum txrx t = kring->tx; nm_os_selwakeup(&kring->si); /* optimization: avoid a wake up on the global * queue if nobody has registered for more * than one ring */ if (na->si_users[t] > 0) nm_os_selwakeup(&na->si[t]); return NM_IRQ_COMPLETED; } #if 0 static int netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags) { if (tx == NR_TX) { KeSetEvent(notes->TX_EVENT, 0, FALSE); } else { KeSetEvent(notes->RX_EVENT, 0, FALSE); } return 0; } #endif /* called by all routines that create netmap_adapters. * provide some defaults and get a reference to the * memory allocator */ int netmap_attach_common(struct netmap_adapter *na) { if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { D("%s: invalid rings tx %d rx %d", na->name, na->num_tx_rings, na->num_rx_rings); return EINVAL; } #ifdef __FreeBSD__ if (na->na_flags & NAF_HOST_RINGS && na->ifp) { na->if_input = na->ifp->if_input; /* for netmap_send_up */ } #endif /* __FreeBSD__ */ if (na->nm_krings_create == NULL) { /* we assume that we have been called by a driver, * since other port types all provide their own * nm_krings_create */ na->nm_krings_create = netmap_hw_krings_create; na->nm_krings_delete = netmap_hw_krings_delete; } if (na->nm_notify == NULL) na->nm_notify = netmap_notify; na->active_fds = 0; if (na->nm_mem == NULL) /* use the global allocator */ na->nm_mem = &nm_mem; netmap_mem_get(na->nm_mem); #ifdef WITH_VALE if (na->nm_bdg_attach == NULL) /* no special nm_bdg_attach callback. On VALE * attach, we need to interpose a bwrap */ na->nm_bdg_attach = netmap_bwrap_attach; #endif return 0; } /* standard cleanup, called by all destructors */ void netmap_detach_common(struct netmap_adapter *na) { if (na->tx_rings) { /* XXX should not happen */ D("freeing leftover tx_rings"); na->nm_krings_delete(na); } netmap_pipe_dealloc(na); if (na->nm_mem) netmap_mem_put(na->nm_mem); bzero(na, sizeof(*na)); free(na, M_DEVBUF); } /* Wrapper for the register callback provided netmap-enabled * hardware drivers. * nm_iszombie(na) means that the driver module has been * unloaded, so we cannot call into it. 
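For reference, a hardware driver normally reaches netmap_attach_common() (and later netmap_hw_reg()) through glue like the sketch below, run once at attach time. The foo_* names are hypothetical driver callbacks, not part of this change; the real per-driver code lives in each driver's netmap header.

/* Illustrative sketch: NIC driver glue for a hypothetical foo(4) driver. */
static int foo_netmap_txsync(struct netmap_kring *, int);
static int foo_netmap_rxsync(struct netmap_kring *, int);
static int foo_netmap_reg(struct netmap_adapter *, int);

static int
foo_netmap_attach(struct ifnet *ifp, u_int nqueues, u_int ntxd, u_int nrxd)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = ifp;
	na.num_tx_desc = ntxd;
	na.num_rx_desc = nrxd;
	na.num_tx_rings = na.num_rx_rings = nqueues;
	na.nm_txsync = foo_netmap_txsync;
	na.nm_rxsync = foo_netmap_rxsync;
	na.nm_register = foo_netmap_reg;	/* wrapped by netmap_hw_reg() */
	return (netmap_attach(&na));		/* defaults fill in the rest */
}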
* nm_os_ifnet_lock() must guarantee mutual exclusion with * module unloading. */ static int netmap_hw_reg(struct netmap_adapter *na, int onoff) { struct netmap_hw_adapter *hwna = (struct netmap_hw_adapter*)na; int error = 0; nm_os_ifnet_lock(); if (nm_iszombie(na)) { if (onoff) { error = ENXIO; } else if (na != NULL) { na->na_flags &= ~NAF_NETMAP_ON; } goto out; } error = hwna->nm_hw_register(na, onoff); out: nm_os_ifnet_unlock(); return error; } static void netmap_hw_dtor(struct netmap_adapter *na) { if (nm_iszombie(na) || na->ifp == NULL) return; WNA(na->ifp) = NULL; } /* * Allocate a ``netmap_adapter`` object, and initialize it from the * 'arg' passed by the driver on attach. * We allocate a block of memory with room for a struct netmap_adapter * plus two sets of N+2 struct netmap_kring (where N is the number * of hardware rings): * krings 0..N-1 are for the hardware queues. * kring N is for the host stack queue * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. */ static int _netmap_attach(struct netmap_adapter *arg, size_t size) { struct netmap_hw_adapter *hwna = NULL; struct ifnet *ifp = NULL; if (arg == NULL || arg->ifp == NULL) goto fail; ifp = arg->ifp; hwna = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); if (hwna == NULL) goto fail; hwna->up = *arg; hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE; strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name)); hwna->nm_hw_register = hwna->up.nm_register; hwna->up.nm_register = netmap_hw_reg; if (netmap_attach_common(&hwna->up)) { free(hwna, M_DEVBUF); goto fail; } netmap_adapter_get(&hwna->up); NM_ATTACH_NA(ifp, &hwna->up); #ifdef linux if (ifp->netdev_ops) { /* prepare a clone of the netdev ops */ #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else hwna->nm_ndo = *ifp->netdev_ops; #endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */ } hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; if (ifp->ethtool_ops) { hwna->nm_eto = *ifp->ethtool_ops; } hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam; #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS hwna->nm_eto.set_channels = linux_netmap_set_channels; #endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */ if (arg->nm_config == NULL) { hwna->up.nm_config = netmap_linux_config; } #endif /* linux */ if (arg->nm_dtor == NULL) { hwna->up.nm_dtor = netmap_hw_dtor; } if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n", hwna->up.num_tx_rings, hwna->up.num_tx_desc, hwna->up.num_rx_rings, hwna->up.num_rx_desc); return 0; fail: D("fail, arg %p ifp %p na %p", arg, ifp, hwna); return (hwna ? EINVAL : ENOMEM); } int netmap_attach(struct netmap_adapter *arg) { return _netmap_attach(arg, sizeof(struct netmap_hw_adapter)); } #ifdef WITH_PTNETMAP_GUEST int -netmap_pt_guest_attach(struct netmap_adapter *arg, - void *csb, - unsigned int nifp_offset, - nm_pt_guest_ptctl_t ptctl) +netmap_pt_guest_attach(struct netmap_adapter *arg, void *csb, + unsigned int nifp_offset, unsigned int memid) { struct netmap_pt_guest_adapter *ptna; struct ifnet *ifp = arg ? 
arg->ifp : NULL; int error; /* get allocator */ - arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, ptctl); + arg->nm_mem = netmap_mem_pt_guest_new(ifp, nifp_offset, memid); if (arg->nm_mem == NULL) return ENOMEM; arg->na_flags |= NAF_MEM_OWNER; error = _netmap_attach(arg, sizeof(struct netmap_pt_guest_adapter)); if (error) return error; /* get the netmap_pt_guest_adapter */ ptna = (struct netmap_pt_guest_adapter *) NA(ifp); ptna->csb = csb; /* Initialize a separate pass-through netmap adapter that is going to * be used by the ptnet driver only, and so never exposed to netmap * applications. We only need a subset of the available fields. */ memset(&ptna->dr, 0, sizeof(ptna->dr)); ptna->dr.up.ifp = ifp; ptna->dr.up.nm_mem = ptna->hwup.up.nm_mem; netmap_mem_get(ptna->dr.up.nm_mem); ptna->dr.up.nm_config = ptna->hwup.up.nm_config; ptna->backend_regifs = 0; return 0; } #endif /* WITH_PTNETMAP_GUEST */ void NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) { if (!na) { return; } refcount_acquire(&na->na_refcount); } /* returns 1 iff the netmap_adapter is destroyed */ int NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) { if (!na) return 1; if (!refcount_release(&na->na_refcount)) return 0; if (na->nm_dtor) na->nm_dtor(na); netmap_detach_common(na); return 1; } /* nm_krings_create callback for all hardware native adapters */ int netmap_hw_krings_create(struct netmap_adapter *na) { int ret = netmap_krings_create(na, 0); if (ret == 0) { /* initialize the mbq for the sw rx ring */ mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); ND("initialized sw rx queue %d", na->num_rx_rings); } return ret; } /* * Called on module unload by the netmap-enabled drivers */ void netmap_detach(struct ifnet *ifp) { struct netmap_adapter *na = NA(ifp); if (!na) return; NMG_LOCK(); netmap_set_all_rings(na, NM_KR_LOCKED); na->na_flags |= NAF_ZOMBIE; /* * if the netmap adapter is not native, somebody * changed it, so we can not release it here. * The NAF_ZOMBIE flag will notify the new owner that * the driver is gone. */ if (na->na_flags & NAF_NATIVE) { netmap_adapter_put(na); } /* give active users a chance to notice that NAF_ZOMBIE has been * turned on, so that they can stop and return an error to userspace. * Note that this becomes a NOP if there are no active users and, * therefore, the put() above has deleted the na, since now NA(ifp) is * NULL. */ netmap_enable_all_rings(ifp); NMG_UNLOCK(); } /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. * * We only store packets in a bounded mbq and then copy them * in the relevant rxsync routine. * * We rely on the OS to make sure that the ifp and na do not go * away (typically the caller checks for IFF_DRV_RUNNING or the like). * In nm_register() or whenever there is a reinitialization, * we make sure to make the mode change visible here. 
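For reference, netmap_transmit() below bounds the host RX queue with modular ring arithmetic (hwtail minus hwcur, wrapped by the ring size). The same check written as a standalone helper, for illustration only:

/* Illustrative sketch: free slots between hwcur and hwtail, modulo ring size. */
static inline u_int
example_ring_space(u_int hwcur, u_int hwtail, u_int num_slots)
{
	int space = (int)hwtail - (int)hwcur;

	if (space < 0)
		space += num_slots;
	return ((u_int)space);
}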
*/ int netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring, *tx_kring; u_int len = MBUF_LEN(m); u_int error = ENOBUFS; unsigned int txr; struct mbq *q; int space; kring = &na->rx_rings[na->num_rx_rings]; // XXX [Linux] we do not need this lock // if we follow the down/configure/up protocol -gl // mtx_lock(&na->core_lock); if (!nm_netmap_on(na)) { D("%s not in netmap mode anymore", na->name); error = ENXIO; goto done; } txr = MBUF_TXQ(m); if (txr >= na->num_tx_rings) { txr %= na->num_tx_rings; } tx_kring = &NMR(na, NR_TX)[txr]; if (tx_kring->nr_mode == NKR_NETMAP_OFF) { return MBUF_TRANSMIT(na, ifp, m); } q = &kring->rx_queue; // XXX reconsider long packets if we handle fragments if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */ D("%s from_host, drop packet size %d > %d", na->name, len, NETMAP_BUF_SIZE(na)); goto done; } if (nm_os_mbuf_has_offld(m)) { RD(1, "%s drop mbuf requiring offloadings", na->name); goto done; } /* protect against rxsync_from_host(), netmap_sw_to_nic() * and maybe other instances of netmap_transmit (the latter * not possible on Linux). * Also avoid overflowing the queue. */ mbq_lock(q); space = kring->nr_hwtail - kring->nr_hwcur; if (space < 0) space += kring->nkr_num_slots; if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), len, m); } else { mbq_enqueue(q, m); ND(10, "%s %d bufs in queue len %d m %p", na->name, mbq_len(q), len, m); /* notify outside the lock */ m = NULL; error = 0; } mbq_unlock(q); done: if (m) m_freem(m); /* unconditionally wake up listeners */ kring->nm_notify(kring, 0); /* this is normally netmap_notify(), but for nics * connected to a bridge it is netmap_bwrap_intr_notify(), * that possibly forwards the frames through the switch */ return (error); } /* * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. * If native netmap mode is not set just return NULL. * If native netmap mode is set, in particular, we have to set nr_mode to * NKR_NETMAP_ON. */ struct netmap_slot * netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur) { struct netmap_kring *kring; int new_hwofs, lim; if (!nm_native_on(na)) { ND("interface not in native netmap mode"); return NULL; /* nothing to reinitialize */ } /* XXX note- in the new scheme, we are not guaranteed to be * under lock (e.g. when called on a device reset). * In this case, we should set a flag and do not trust too * much the values. In practice: TODO * - set a RESET flag somewhere in the kring * - do the processing in a conservative way * - let the *sync() fixup at the end. */ if (tx == NR_TX) { if (n >= na->num_tx_rings) return NULL; kring = na->tx_rings + n; if (kring->nr_pending_mode == NKR_NETMAP_OFF) { kring->nr_mode = NKR_NETMAP_OFF; return NULL; } // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; if (kring->nr_pending_mode == NKR_NETMAP_OFF) { kring->nr_mode = NKR_NETMAP_OFF; return NULL; } new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; if (new_hwofs > lim) new_hwofs -= lim + 1; /* Always set the new offset value and realign the ring. */ if (netmap_verbose) D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", na->name, tx == NR_TX ? 
"TX" : "RX", n, kring->nkr_hwofs, new_hwofs, kring->nr_hwtail, tx == NR_TX ? lim : kring->nr_hwtail); kring->nkr_hwofs = new_hwofs; if (tx == NR_TX) { kring->nr_hwtail = kring->nr_hwcur + lim; if (kring->nr_hwtail > lim) kring->nr_hwtail -= lim + 1; } #if 0 // def linux /* XXX check that the mappings are correct */ /* need ring_nr, adapter->pdev, direction */ buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { D("error mapping rx netmap buffer %d", i); // XXX fix error handling } #endif /* linux */ /* * Wakeup on the individual and global selwait * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ kring->nr_mode = NKR_NETMAP_ON; kring->nm_notify(kring, 0); return kring->ring->slot; } /* * Dispatch rx/tx interrupts to the netmap rings. * * "work_done" is non-null on the RX path, NULL for the TX path. * We rely on the OS to make sure that there is only one active * instance per queue, and that there is appropriate locking. * * The 'notify' routine depends on what the ring is attached to. * - for a netmap file descriptor, do a selwakeup on the individual * waitqueue, plus one on the global one if needed * (see netmap_notify) * - for a nic connected to a switch, call the proper forwarding routine * (see netmap_bwrap_intr_notify) */ int netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done) { struct netmap_kring *kring; enum txrx t = (work_done ? NR_RX : NR_TX); q &= NETMAP_RING_MASK; if (netmap_verbose) { RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); } if (q >= nma_get_nrings(na, t)) return NM_IRQ_PASS; // not a physical queue kring = NMR(na, t) + q; if (kring->nr_mode == NKR_NETMAP_OFF) { return NM_IRQ_PASS; } if (t == NR_RX) { kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? *work_done = 1; /* do not fire napi again */ } return kring->nm_notify(kring, 0); } /* * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. * * If the card is not in netmap mode, simply return NM_IRQ_PASS, * so that the caller proceeds with regular processing. * Otherwise call netmap_common_irq(). * * If the card is connected to a netmap file descriptor, * do a selwakeup on the individual queue, plus one on the global one * if needed (multiqueue card _and_ there are multiqueue listeners), * and return NR_IRQ_COMPLETED. * * Finally, if called on rx from an interface connected to a switch, * calls the proper forwarding routine. */ int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) { struct netmap_adapter *na = NA(ifp); /* * XXX emulated netmap mode sets NAF_SKIP_INTR so * we still use the regular driver even though the previous * check fails. It is unclear whether we should use * nm_native_on() here. */ if (!nm_netmap_on(na)) return NM_IRQ_PASS; if (na->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); return NM_IRQ_PASS; } return netmap_common_irq(na, q, work_done); } /* * Module loader and unloader * * netmap_init() creates the /dev/netmap device and initializes * all global variables. Returns 0 on success, errno on failure * (but there is no chance) * * netmap_fini() destroys everything. */ static struct cdev *netmap_dev; /* /dev/netmap character device. 
*/ extern struct cdevsw netmap_cdevsw; void netmap_fini(void) { if (netmap_dev) destroy_dev(netmap_dev); /* we assume that there are no longer netmap users */ nm_os_ifnet_fini(); netmap_uninit_bridges(); netmap_mem_fini(); NMG_LOCK_DESTROY(); printf("netmap: unloaded module.\n"); } int netmap_init(void) { int error; NMG_LOCK_INIT(); error = netmap_mem_init(); if (error != 0) goto fail; /* * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls * when the module is compiled in. * XXX could use make_dev_credv() to get error number */ netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD, &netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "netmap"); if (!netmap_dev) goto fail; error = netmap_init_bridges(); if (error) goto fail; #ifdef __FreeBSD__ nm_os_vi_init_index(); #endif error = nm_os_ifnet_init(); if (error) goto fail; printf("netmap: loaded module\n"); return (0); fail: netmap_fini(); return (EINVAL); /* may be incorrect */ } Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_freebsd.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_freebsd.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_freebsd.c (revision 308054) @@ -1,1498 +1,1468 @@ /* * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ #include /* DEV_MODULE_ORDERED */ #include #include /* kern_ioctl() */ #include #include /* vtophys */ #include /* vtophys */ #include #include #include #include #include #include #include /* sockaddrs */ #include #include /* kthread_add() */ #include /* PROC_LOCK() */ #include /* RFNOWAIT */ #include /* sched_bind() */ #include /* mp_maxid */ #include #include #include /* IFT_ETHER */ #include /* ether_ifdetach */ #include /* LLADDR */ #include /* bus_dmamap_* */ #include /* in6_cksum_pseudo() */ #include /* in_pseudo(), in_cksum_hdr() */ #include #include #include #include /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ void nm_os_selinfo_init(NM_SELINFO_T *si) { struct mtx *m = &si->m; mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); knlist_init_mtx(&si->si.si_note, m); } void nm_os_selinfo_uninit(NM_SELINFO_T *si) { /* XXX kqueue(9) needed; these will mirror knlist_init. */ knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); knlist_destroy(&si->si.si_note); /* now we don't need the mutex anymore */ mtx_destroy(&si->m); } void nm_os_ifnet_lock(void) { IFNET_WLOCK(); } void nm_os_ifnet_unlock(void) { IFNET_WUNLOCK(); } static int netmap_use_count = 0; void nm_os_get_module(void) { netmap_use_count++; } void nm_os_put_module(void) { netmap_use_count--; } static void netmap_ifnet_arrival_handler(void *arg __unused, struct ifnet *ifp) { netmap_undo_zombie(ifp); } static void netmap_ifnet_departure_handler(void *arg __unused, struct ifnet *ifp) { netmap_make_zombie(ifp); } static eventhandler_tag nm_ifnet_ah_tag; static eventhandler_tag nm_ifnet_dh_tag; int nm_os_ifnet_init(void) { nm_ifnet_ah_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, netmap_ifnet_arrival_handler, NULL, EVENTHANDLER_PRI_ANY); nm_ifnet_dh_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, netmap_ifnet_departure_handler, NULL, EVENTHANDLER_PRI_ANY); return 0; } void nm_os_ifnet_fini(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nm_ifnet_ah_tag); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nm_ifnet_dh_tag); } rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; int nw = len / 2; int i; for (i = 0; i < nw; i++) cur_sum += be16toh(words[i]); if (len & 1) cur_sum += (data[len-1] << 8); return cur_sum; } /* Fold a raw checksum: 'cur_sum' is in host byte order, while the * return value is in network byte order. */ uint16_t nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ while (cur_sum >> 16) cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); return htobe16((~cur_sum) & 0xFFFF); } uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET uint16_t pseudolen = datalen + iph->protocol; /* Compute and insert the pseudo-header cheksum. */ *check = in_pseudo(iph->saddr, iph->daddr, htobe16(pseudolen)); /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). 
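For reference, the raw/fold helpers above combine as below, e.g. to checksum a UDP header plus payload. A minimal sketch; udp_hdr and udp_len are placeholders, and the pseudo-header term is handled separately, as in nm_os_csum_tcpudp_ipv4() that follows.

/* Illustrative sketch: one's-complement checksum via the raw/fold helpers. */
rawsum_t raw;
uint16_t folded;

raw = nm_os_csum_raw((uint8_t *)udp_hdr, udp_len, 0);	/* host byte order */
folded = nm_os_csum_fold(raw);				/* network byte order */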
*/ *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; D("inet4 segmentation not supported"); } #endif } void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; D("inet6 segmentation not supported"); } #endif } /* on FreeBSD we send up one packet at a time */ void * nm_os_send_up(struct ifnet *ifp, struct mbuf *m, struct mbuf *prev) { NA(ifp)->if_input(ifp, m); return NULL; } int nm_os_mbuf_has_offld(struct mbuf *m) { return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6 | CSUM_TSO); } static void freebsd_generic_rx_handler(struct ifnet *ifp, struct mbuf *m) { struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)NA(ifp); int stolen = generic_rx_handler(ifp, m); if (!stolen) { gna->save_if_input(ifp, m); } } /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = na->ifp; if (intercept) { if (gna->save_if_input) { D("cannot intercept again"); return EINVAL; /* already set */ } gna->save_if_input = ifp->if_input; ifp->if_input = freebsd_generic_rx_handler; } else { if (!gna->save_if_input){ D("cannot restore"); return EINVAL; /* not saved */ } ifp->if_input = gna->save_if_input; gna->save_if_input = NULL; } return 0; } /* * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. * Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. */ int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; struct ifnet *ifp = netmap_generic_getifp(gna); if (intercept) { na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; } else { ifp->if_transmit = na->if_transmit; } return 0; } /* * Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. * * We should add a reference to the mbuf so the m_freem() at the end * of the transmission does not consume resources. * * On FreeBSD, and on multiqueue cards, we can force the queue using * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) * i = m->m_pkthdr.flowid % adapter->num_queues; * else * i = curcpu % adapter->num_queues; * */ int nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; u_int len = a->len; struct ifnet *ifp = a->ifp; struct mbuf *m = a->m; #if __FreeBSD_version < 1100000 /* * Old FreeBSD versions. The mbuf has a cluster attached, * we need to copy from the cluster to the netmap buffer. */ if (MBUF_REFCNT(m) != 1) { D("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } if (m->m_ext.ext_size < len) { RD(5, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } bcopy(a->addr, m->m_data, len); #else /* __FreeBSD_version >= 1100000 */ /* New FreeBSD versions. Link the external storage to * the netmap buffer, so that no copy is necessary. 
*/ m->m_ext.ext_buf = m->m_data = a->addr; m->m_ext.ext_size = len; #endif /* __FreeBSD_version >= 1100000 */ m->m_len = m->m_pkthdr.len = len; /* mbuf refcnt is not contended, no need to use atomic * (a memory barrier is enough). */ SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ ret = NA(ifp)->if_transmit(ifp, m); return ret ? -1 : 0; } #if __FreeBSD_version >= 1100005 struct netmap_adapter * netmap_getna(if_t ifp) { return (NA((struct ifnet *)ifp)); } #endif /* __FreeBSD_version >= 1100005 */ /* * The following two functions are empty until we have a generic * way to extract the info from the ifp */ int nm_os_generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) { D("called, in tx %d rx %d", *tx, *rx); return 0; } void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called, in txq %d rxq %d", *txq, *rxq); *txq = netmap_generic_rings; *rxq = netmap_generic_rings; } void nm_os_generic_set_features(struct netmap_generic_adapter *gna) { gna->rxsg = 1; /* Supported through m_copydata. */ gna->txqdisc = 0; /* Not supported. */ } void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { ND("called"); mit->mit_pending = 0; mit->mit_ring_idx = idx; mit->mit_na = na; } void nm_os_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } void nm_os_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } int nm_os_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; } void nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } static int nm_vi_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) { return EINVAL; } static void nm_vi_start(struct ifnet *ifp) { panic("nm_vi_start() must not be called"); } /* * Index manager of persistent virtual interfaces. * It is used to decide the lowest byte of the MAC address. * We use the same algorithm with management of bridge port index. */ #define NM_VI_MAX 255 static struct { uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */ uint8_t active; struct mtx lock; } nm_vi_indices; void nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) nm_vi_indices.index[i] = i; nm_vi_indices.active = 0; mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF); } /* return -1 if no index available */ static int nm_vi_get_index(void) { int ret; mtx_lock(&nm_vi_indices.lock); ret = nm_vi_indices.active == NM_VI_MAX ? -1 : nm_vi_indices.index[nm_vi_indices.active++]; mtx_unlock(&nm_vi_indices.lock); return ret; } static void nm_vi_free_index(uint8_t val) { int i, lim; mtx_lock(&nm_vi_indices.lock); lim = nm_vi_indices.active; for (i = 0; i < lim; i++) { if (nm_vi_indices.index[i] == val) { /* swap index[lim-1] and j */ int tmp = nm_vi_indices.index[lim-1]; nm_vi_indices.index[lim-1] = val; nm_vi_indices.index[i] = tmp; nm_vi_indices.active--; break; } } if (lim == nm_vi_indices.active) D("funny, index %u didn't found", val); mtx_unlock(&nm_vi_indices.lock); } #undef NM_VI_MAX /* * Implementation of a netmap-capable virtual interface that * registered to the system. * It is based on if_tap.c and ip_fw_log.c in FreeBSD 9. * * Note: Linux sets refcount to 0 on allocation of net_device, * then increments it on registration to the system. * FreeBSD sets refcount to 1 on if_alloc(), and does not * increment this refcount on if_attach(). 
*/ int nm_os_vi_persist(const char *name, struct ifnet **ret) { struct ifnet *ifp; u_short macaddr_hi; uint32_t macaddr_mid; u_char eaddr[6]; int unit = nm_vi_get_index(); /* just to decide MAC address */ if (unit < 0) return EBUSY; /* * We use the same MAC address generation method with tap * except for the highest octet is 00:be instead of 00:bd */ macaddr_hi = htons(0x00be); /* XXX tap + 1 */ macaddr_mid = (uint32_t) ticks; bcopy(&macaddr_hi, eaddr, sizeof(short)); bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); eaddr[5] = (uint8_t)unit; ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { D("if_alloc failed"); return ENOMEM; } if_initname(ifp, name, IF_DUNIT_NONE); ifp->if_mtu = 65536; ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = (void *)nm_vi_dummy; ifp->if_ioctl = nm_vi_dummy; ifp->if_start = nm_vi_start; ifp->if_mtu = ETHERMTU; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; ether_ifattach(ifp, eaddr); *ret = ifp; return 0; } /* unregister from the system and drop the final refcount */ void nm_os_vi_detach(struct ifnet *ifp) { nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } /* ======================== PTNETMAP SUPPORT ========================== */ #ifdef WITH_PTNETMAP_GUEST #include #include #include /* bus_dmamap_* */ #include #include #include /* * ptnetmap memory device (memdev) for freebsd guest, * ssed to expose host netmap memory to the guest through a PCI BAR. */ /* * ptnetmap memdev private data structure */ struct ptnetmap_memdev { device_t dev; struct resource *pci_io; struct resource *pci_mem; struct netmap_mem_d *nm_mem; }; static int ptn_memdev_probe(device_t); static int ptn_memdev_attach(device_t); static int ptn_memdev_detach(device_t); static int ptn_memdev_shutdown(device_t); static device_method_t ptn_memdev_methods[] = { DEVMETHOD(device_probe, ptn_memdev_probe), DEVMETHOD(device_attach, ptn_memdev_attach), DEVMETHOD(device_detach, ptn_memdev_detach), DEVMETHOD(device_shutdown, ptn_memdev_shutdown), DEVMETHOD_END }; static driver_t ptn_memdev_driver = { PTNETMAP_MEMDEV_NAME, ptn_memdev_methods, sizeof(struct ptnetmap_memdev), }; /* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation * below. */ static devclass_t ptnetmap_devclass; DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, ptnetmap_devclass, NULL, NULL, SI_ORDER_MIDDLE + 1); /* - * I/O port read/write wrappers. - * Some are not used, so we keep them commented out until needed - */ -#define ptn_ioread16(ptn_dev, reg) bus_read_2((ptn_dev)->pci_io, (reg)) -#define ptn_ioread32(ptn_dev, reg) bus_read_4((ptn_dev)->pci_io, (reg)) -#if 0 -#define ptn_ioread8(ptn_dev, reg) bus_read_1((ptn_dev)->pci_io, (reg)) -#define ptn_iowrite8(ptn_dev, reg, val) bus_write_1((ptn_dev)->pci_io, (reg), (val)) -#define ptn_iowrite16(ptn_dev, reg, val) bus_write_2((ptn_dev)->pci_io, (reg), (val)) -#define ptn_iowrite32(ptn_dev, reg, val) bus_write_4((ptn_dev)->pci_io, (reg), (val)) -#endif /* unused */ - -/* * Map host netmap memory through PCI-BAR in the guest OS, * returning physical (nm_paddr) and virtual (nm_addr) addresses * of the netmap memory mapped in the guest. 
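For reference, the iomap routine that follows now assembles a 64-bit region size from two 32-bit BAR registers. The same computation written out as standalone expressions, for illustration only:

/* Illustrative sketch: combine the HI/LO registers into the 64-bit size. */
uint64_t mem_size;

mem_size = (uint64_t)bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_HI) << 32;
mem_size |= bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_LO);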
*/ int nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr, - void **nm_addr) + void **nm_addr, uint64_t *mem_size) { - uint32_t mem_size; int rid; D("ptn_memdev_driver iomap"); rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR); - mem_size = ptn_ioread32(ptn_dev, PTNETMAP_IO_PCI_MEMSIZE); + *mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_HI); + *mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_LO) | + (*mem_size << 32); /* map memory allocator */ ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY, - &rid, 0, ~0, mem_size, RF_ACTIVE); + &rid, 0, ~0, *mem_size, RF_ACTIVE); if (ptn_dev->pci_mem == NULL) { *nm_paddr = 0; *nm_addr = 0; return ENOMEM; } *nm_paddr = rman_get_start(ptn_dev->pci_mem); *nm_addr = rman_get_virtual(ptn_dev->pci_mem); - D("=== BAR %d start %lx len %lx mem_size %x ===", + D("=== BAR %d start %lx len %lx mem_size %lx ===", PTNETMAP_MEM_PCI_BAR, (unsigned long)(*nm_paddr), (unsigned long)rman_get_size(ptn_dev->pci_mem), - mem_size); + (unsigned long)*mem_size); return (0); } +uint32_t +nm_os_pt_memdev_ioread(struct ptnetmap_memdev *ptn_dev, unsigned int reg) +{ + return bus_read_4(ptn_dev->pci_io, reg); +} + /* Unmap host netmap memory. */ void nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev) { D("ptn_memdev_driver iounmap"); if (ptn_dev->pci_mem) { bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); ptn_dev->pci_mem = NULL; } } /* Device identification routine, return BUS_PROBE_DEFAULT on success, * positive on failure */ static int ptn_memdev_probe(device_t dev) { char desc[256]; if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID) return (ENXIO); if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID) return (ENXIO); snprintf(desc, sizeof(desc), "%s PCI adapter", PTNETMAP_MEMDEV_NAME); device_set_desc_copy(dev, desc); return (BUS_PROBE_DEFAULT); } /* Device initialization routine. */ static int ptn_memdev_attach(device_t dev) { struct ptnetmap_memdev *ptn_dev; int rid; uint16_t mem_id; D("ptn_memdev_driver attach"); ptn_dev = device_get_softc(dev); ptn_dev->dev = dev; pci_enable_busmaster(dev); rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, RF_ACTIVE); if (ptn_dev->pci_io == NULL) { device_printf(dev, "cannot map I/O space\n"); return (ENXIO); } - mem_id = ptn_ioread16(ptn_dev, PTNETMAP_IO_PCI_HOSTID); + mem_id = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMID); /* create guest allocator */ ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id); if (ptn_dev->nm_mem == NULL) { ptn_memdev_detach(dev); return (ENOMEM); } netmap_mem_get(ptn_dev->nm_mem); - D("ptn_memdev_driver probe OK - host_id: %d", mem_id); + D("ptn_memdev_driver probe OK - host_mem_id: %d", mem_id); return (0); } /* Device removal routine. 
*/ static int ptn_memdev_detach(device_t dev) { struct ptnetmap_memdev *ptn_dev; D("ptn_memdev_driver detach"); ptn_dev = device_get_softc(dev); if (ptn_dev->nm_mem) { netmap_mem_put(ptn_dev->nm_mem); ptn_dev->nm_mem = NULL; } if (ptn_dev->pci_mem) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); ptn_dev->pci_mem = NULL; } if (ptn_dev->pci_io) { bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); ptn_dev->pci_io = NULL; } return (0); } static int ptn_memdev_shutdown(device_t dev) { D("ptn_memdev_driver shutdown"); return bus_generic_shutdown(dev); } #endif /* WITH_PTNETMAP_GUEST */ /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and * destructor. */ struct netmap_vm_handle_t { struct cdev *dev; struct netmap_priv_d *priv; }; static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct netmap_vm_handle_t *vmh = handle; if (netmap_verbose) D("handle %p size %jd prot %d foff %jd", handle, (intmax_t)size, prot, (intmax_t)foff); if (color) *color = 0; dev_ref(vmh->dev); return 0; } static void netmap_dev_pager_dtor(void *handle) { struct netmap_vm_handle_t *vmh = handle; struct cdev *dev = vmh->dev; struct netmap_priv_d *priv = vmh->priv; if (netmap_verbose) D("handle %p", handle); netmap_dtor(priv); free(vmh, M_DEVBUF); dev_rel(dev); } static int netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct netmap_vm_handle_t *vmh = object->handle; struct netmap_priv_d *priv = vmh->priv; struct netmap_adapter *na = priv->np_na; vm_paddr_t paddr; vm_page_t page; vm_memattr_t memattr; vm_pindex_t pidx; ND("object %p offset %jd prot %d mres %p", object, (intmax_t)offset, prot, mres); memattr = object->memattr; pidx = OFF_TO_IDX(offset); paddr = netmap_mem_ofstophys(na->nm_mem, offset); if (paddr == 0) return VM_PAGER_FAIL; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake page, update it with * the new physical address. */ page = *mres; vm_page_updatefake(page, paddr, memattr); } else { /* * Replace the passed in reqpage page with our own fake page and * free up the all of the original pages. 
*/ #ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK #define VM_OBJECT_WLOCK VM_OBJECT_LOCK #endif /* VM_OBJECT_WUNLOCK */ VM_OBJECT_WUNLOCK(object); page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); vm_page_lock(*mres); vm_page_free(*mres); vm_page_unlock(*mres); *mres = page; vm_page_insert(page, object, pidx); } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); } static struct cdev_pager_ops netmap_cdev_pager_ops = { .cdev_pg_ctor = netmap_dev_pager_ctor, .cdev_pg_dtor = netmap_dev_pager_dtor, .cdev_pg_fault = netmap_dev_pager_fault, }; static int netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, vm_size_t objsize, vm_object_t *objp, int prot) { int error; struct netmap_vm_handle_t *vmh; struct netmap_priv_d *priv; vm_object_t obj; if (netmap_verbose) D("cdev %p foff %jd size %jd objp %p prot %d", cdev, (intmax_t )*foff, (intmax_t )objsize, objp, prot); vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (vmh == NULL) return ENOMEM; vmh->dev = cdev; NMG_LOCK(); error = devfs_get_cdevpriv((void**)&priv); if (error) goto err_unlock; if (priv->np_nifp == NULL) { error = EINVAL; goto err_unlock; } vmh->priv = priv; priv->np_refs++; NMG_UNLOCK(); obj = cdev_pager_allocate(vmh, OBJT_DEVICE, &netmap_cdev_pager_ops, objsize, prot, *foff, NULL); if (obj == NULL) { D("cdev_pager_allocate failed"); error = EINVAL; goto err_deref; } *objp = obj; return 0; err_deref: NMG_LOCK(); priv->np_refs--; err_unlock: NMG_UNLOCK(); // err: free(vmh, M_DEVBUF); return error; } /* * On FreeBSD the close routine is only called on the last close on * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor * when the last fd pointing to the device is closed. * * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to * netmap_dtor() is called when the process has no open fds and no active * memory maps on /dev/netmap, as in linux. 
*/ static int netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { if (netmap_verbose) D("dev %p fflag 0x%x devtype %d td %p", dev, fflag, devtype, td); return 0; } static int netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct netmap_priv_d *priv; int error; (void)dev; (void)oflags; (void)devtype; (void)td; NMG_LOCK(); priv = netmap_priv_new(); if (priv == NULL) { error = ENOMEM; goto out; } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { netmap_priv_delete(priv); } out: NMG_UNLOCK(); return error; } /******************** kthread wrapper ****************/ #include u_int nm_os_ncpus(void) { return mp_maxid + 1; } struct nm_kthread_ctx { struct thread *user_td; /* thread user-space (kthread creator) to send ioctl */ - /* notification to guest (interrupt) */ - int irq_fd; /* ioctl fd */ - struct nm_kth_ioctl irq_ioctl; /* ioctl arguments */ + struct ptnetmap_cfgentry_bhyve cfg; - /* notification from guest */ - void *ioevent_file; /* tsleep() argument */ - /* worker function and parameter */ nm_kthread_worker_fn_t worker_fn; void *worker_private; struct nm_kthread *nmk; /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ long type; }; struct nm_kthread { struct thread *worker; struct mtx worker_lock; uint64_t scheduled; /* pending wake_up request */ struct nm_kthread_ctx worker_ctx; int run; /* used to stop kthread */ int attach_user; /* kthread attached to user_process */ int affinity; }; void inline nm_os_kthread_wakeup_worker(struct nm_kthread *nmk) { /* * There may be a race between FE and BE, * which call both this function, and worker kthread, * that reads nmk->scheduled. * * For us it is not important the counter value, * but simply that it has changed since the last * time the kthread saw it. 
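For reference, the scheduled counter acts as a lost-wakeup guard between the kicker and the worker. A condensed sketch of the same protocol with placeholder names (lk, scheduled, seen, wchan, example_work); the real state lives in struct nm_kthread below.

/* Illustrative sketch of the counter-based kick protocol used below. */
static struct mtx lk;			/* initialized like worker_lock below */
static uint64_t scheduled, seen;
static void *wchan = &scheduled;	/* any stable address can be the wchan */
static void example_work(void);		/* placeholder for the worker body */

static void
example_kick(void)
{
	mtx_lock(&lk);
	scheduled++;			/* the value is irrelevant, only the change matters */
	wakeup(wchan);
	mtx_unlock(&lk);
}

static void
example_worker_iteration(void)
{
	mtx_lock(&lk);
	if (scheduled != seen) {	/* a kick arrived since the last look */
		seen = scheduled;
		mtx_unlock(&lk);
		example_work();
	} else {
		msleep_spin(wchan, &lk, "nmk_ev", hz);	/* one second timeout */
		mtx_unlock(&lk);
	}
}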
*/ mtx_lock(&nmk->worker_lock); nmk->scheduled++; - if (nmk->worker_ctx.ioevent_file) { - wakeup(nmk->worker_ctx.ioevent_file); + if (nmk->worker_ctx.cfg.wchan) { + wakeup((void *)nmk->worker_ctx.cfg.wchan); } mtx_unlock(&nmk->worker_lock); } void inline nm_os_kthread_send_irq(struct nm_kthread *nmk) { struct nm_kthread_ctx *ctx = &nmk->worker_ctx; int err; - if (ctx->user_td && ctx->irq_fd > 0) { - err = kern_ioctl(ctx->user_td, ctx->irq_fd, ctx->irq_ioctl.com, (caddr_t)&ctx->irq_ioctl.data.msix); + if (ctx->user_td && ctx->cfg.ioctl_fd > 0) { + err = kern_ioctl(ctx->user_td, ctx->cfg.ioctl_fd, ctx->cfg.ioctl_cmd, + (caddr_t)&ctx->cfg.ioctl_data); if (err) { - D("kern_ioctl error: %d ioctl parameters: fd %d com %ju data %p", - err, ctx->irq_fd, (uintmax_t)ctx->irq_ioctl.com, &ctx->irq_ioctl.data); + D("kern_ioctl error: %d ioctl parameters: fd %d com %lu data %p", + err, ctx->cfg.ioctl_fd, (unsigned long)ctx->cfg.ioctl_cmd, + &ctx->cfg.ioctl_data); } } } static void nm_kthread_worker(void *data) { struct nm_kthread *nmk = data; struct nm_kthread_ctx *ctx = &nmk->worker_ctx; uint64_t old_scheduled = nmk->scheduled; if (nmk->affinity >= 0) { thread_lock(curthread); sched_bind(curthread, nmk->affinity); thread_unlock(curthread); } while (nmk->run) { /* * check if the parent process dies * (when kthread is attached to user process) */ if (ctx->user_td) { PROC_LOCK(curproc); thread_suspend_check(0); PROC_UNLOCK(curproc); } else { kthread_suspend_check(); } /* - * if ioevent_file is not defined, we don't have notification + * if wchan is not defined, we don't have notification * mechanism and we continually execute worker_fn() */ - if (!ctx->ioevent_file) { + if (!ctx->cfg.wchan) { ctx->worker_fn(ctx->worker_private); /* worker body */ } else { /* checks if there is a pending notification */ mtx_lock(&nmk->worker_lock); if (likely(nmk->scheduled != old_scheduled)) { old_scheduled = nmk->scheduled; mtx_unlock(&nmk->worker_lock); ctx->worker_fn(ctx->worker_private); /* worker body */ continue; } else if (nmk->run) { /* wait on event with one second timeout */ - msleep_spin(ctx->ioevent_file, &nmk->worker_lock, + msleep_spin((void *)ctx->cfg.wchan, &nmk->worker_lock, "nmk_ev", hz); nmk->scheduled++; } mtx_unlock(&nmk->worker_lock); } } kthread_exit(); } -static int -nm_kthread_open_files(struct nm_kthread *nmk, struct nm_kthread_cfg *cfg) -{ - /* send irq through ioctl to bhyve (vmm.ko) */ - if (cfg->event.irqfd) { - nmk->worker_ctx.irq_fd = cfg->event.irqfd; - nmk->worker_ctx.irq_ioctl = cfg->event.ioctl; - } - /* ring.ioeventfd contains the chan where do tsleep to wait events */ - if (cfg->event.ioeventfd) { - nmk->worker_ctx.ioevent_file = (void *)cfg->event.ioeventfd; - } - - return 0; -} - -static void -nm_kthread_close_files(struct nm_kthread *nmk) -{ - nmk->worker_ctx.irq_fd = 0; - nmk->worker_ctx.ioevent_file = NULL; -} - void nm_os_kthread_set_affinity(struct nm_kthread *nmk, int affinity) { nmk->affinity = affinity; } struct nm_kthread * -nm_os_kthread_create(struct nm_kthread_cfg *cfg) +nm_os_kthread_create(struct nm_kthread_cfg *cfg, unsigned int cfgtype, + void *opaque) { struct nm_kthread *nmk = NULL; - int error; + if (cfgtype != PTNETMAP_CFGTYPE_BHYVE) { + D("Unsupported cfgtype %u", cfgtype); + return NULL; + } + nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); if (!nmk) return NULL; mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_SPIN); nmk->worker_ctx.worker_fn = cfg->worker_fn; nmk->worker_ctx.worker_private = cfg->worker_private; nmk->worker_ctx.type = 
cfg->type; nmk->affinity = -1; /* attach kthread to user process (ptnetmap) */ nmk->attach_user = cfg->attach_user; - /* open event fd */ - error = nm_kthread_open_files(nmk, cfg); - if (error) - goto err; + /* store kick/interrupt configuration */ + if (opaque) { + nmk->worker_ctx.cfg = *((struct ptnetmap_cfgentry_bhyve *)opaque); + } return nmk; -err: - free(nmk, M_DEVBUF); - return NULL; } int nm_os_kthread_start(struct nm_kthread *nmk) { struct proc *p = NULL; int error = 0; if (nmk->worker) { return EBUSY; } /* check if we want to attach kthread to user process */ if (nmk->attach_user) { nmk->worker_ctx.user_td = curthread; p = curthread->td_proc; } /* enable kthread main loop */ nmk->run = 1; /* create kthread */ if((error = kthread_add(nm_kthread_worker, nmk, p, &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", nmk->worker_ctx.type))) { goto err; } - D("nm_kthread started td 0x%p", nmk->worker); + D("nm_kthread started td %p", nmk->worker); return 0; err: D("nm_kthread start failed err %d", error); nmk->worker = NULL; return error; } void nm_os_kthread_stop(struct nm_kthread *nmk) { if (!nmk->worker) { return; } /* tell to kthread to exit from main loop */ nmk->run = 0; /* wake up kthread if it sleeps */ kthread_resume(nmk->worker); nm_os_kthread_wakeup_worker(nmk); nmk->worker = NULL; } void nm_os_kthread_delete(struct nm_kthread *nmk) { if (!nmk) return; if (nmk->worker) { nm_os_kthread_stop(nmk); } - nm_kthread_close_files(nmk); + memset(&nmk->worker_ctx.cfg, 0, sizeof(nmk->worker_ctx.cfg)); free(nmk, M_DEVBUF); } /******************** kqueue support ****************/ /* * nm_os_selwakeup also needs to issue a KNOTE_UNLOCKED. * We use a non-zero argument to distinguish the call from the one * in kevent_scan() which instead also needs to run netmap_poll(). * The knote uses a global mutex for the time being. We might * try to reuse the one in the si, but it is not allocated * permanently so it might be a bit tricky. * * The *kqfilter function registers one or another f_event * depending on read or write mode. * In the call to f_event() td_fpop is NULL so any child function * calling devfs_get_cdevpriv() would fail - and we need it in * netmap_poll(). As a workaround we store priv into kn->kn_hook * and pass it as first argument to netmap_poll(), which then * uses the failure to tell that we are called from f_event() * and do not need the selrecord(). */ void nm_os_selwakeup(struct nm_selinfo *si) { if (netmap_verbose) D("on knote %p", &si->si.si_note); selwakeuppri(&si->si, PI_NET); /* use a non-zero hint to tell the notification from the * call done in kqueue_scan() which uses 0 */ KNOTE_UNLOCKED(&si->si.si_note, 0x100 /* notification */); } void nm_os_selrecord(struct thread *td, struct nm_selinfo *si) { selrecord(td, &si->si); } static void netmap_knrdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct selinfo *si = &priv->np_si[NR_RX]->si; D("remove selinfo %p", si); knlist_remove(&si->si_note, kn, 0); } static void netmap_knwdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct selinfo *si = &priv->np_si[NR_TX]->si; D("remove selinfo %p", si); knlist_remove(&si->si_note, kn, 0); } /* * callback from notifies (generated externally) and our * calls to kevent(). The former we just return 1 (ready) * since we do not know better. * In the latter we call netmap_poll and return 0/1 accordingly. 
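For reference, the kqueue glue below is exercised from user space roughly as follows. A minimal sketch, assuming <sys/event.h>; nmfd is a placeholder for a descriptor that has already gone through NIOCREGIF.

/* Illustrative sketch: waiting for RX traffic with kevent(2) on a netmap fd. */
struct kevent change, event;
int kq = kqueue();

EV_SET(&change, nmfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
kevent(kq, &change, 1, NULL, 0, NULL);	/* registered via netmap_kqfilter() */
kevent(kq, NULL, 0, &event, 1, NULL);	/* readiness comes from netmap_knread() */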
*/ static int netmap_knrw(struct knote *kn, long hint, int events) { struct netmap_priv_d *priv; int revents; if (hint != 0) { ND(5, "call from notify"); return 1; /* assume we are ready */ } priv = kn->kn_hook; /* the notification may come from an external thread, * in which case we do not want to run the netmap_poll * This should be filtered above, but check just in case. */ if (curthread != priv->np_td) { /* should not happen */ RD(5, "curthread changed %p %p", curthread, priv->np_td); return 1; } else { revents = netmap_poll(priv, events, NULL); return (events & revents) ? 1 : 0; } } static int netmap_knread(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLIN); } static int netmap_knwrite(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLOUT); } static struct filterops netmap_rfiltops = { .f_isfd = 1, .f_detach = netmap_knrdetach, .f_event = netmap_knread, }; static struct filterops netmap_wfiltops = { .f_isfd = 1, .f_detach = netmap_knwdetach, .f_event = netmap_knwrite, }; /* * This is called when a thread invokes kevent() to record * a change in the configuration of the kqueue(). * The 'priv' should be the same as in the netmap device. */ static int netmap_kqfilter(struct cdev *dev, struct knote *kn) { struct netmap_priv_d *priv; int error; struct netmap_adapter *na; struct nm_selinfo *si; int ev = kn->kn_filter; if (ev != EVFILT_READ && ev != EVFILT_WRITE) { D("bad filter request %d", ev); return 1; } error = devfs_get_cdevpriv((void**)&priv); if (error) { D("device not yet setup"); return 1; } na = priv->np_na; if (na == NULL) { D("no netmap adapter for this file descriptor"); return 1; } /* the si is indicated in the priv */ si = priv->np_si[(ev == EVFILT_WRITE) ? NR_TX : NR_RX]; // XXX lock(priv) ? kn->kn_fop = (ev == EVFILT_WRITE) ? &netmap_wfiltops : &netmap_rfiltops; kn->kn_hook = priv; knlist_add(&si->si.si_note, kn, 1); // XXX unlock(priv) ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s", na, na->ifp->if_xname, curthread, priv, kn, priv->np_nifp, kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH"); return 0; } static int freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) { struct netmap_priv_d *priv; if (devfs_get_cdevpriv((void **)&priv)) { return POLLERR; } return netmap_poll(priv, events, td); } static int freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int ffla __unused, struct thread *td) { int error; struct netmap_priv_d *priv; CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); if (error) { /* XXX ENOENT should be impossible, since the priv * is now created in the open */ if (error == ENOENT) error = ENXIO; goto out; } error = netmap_ioctl(priv, cmd, data, td); out: CURVNET_RESTORE(); return error; } extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, .d_ioctl = freebsd_netmap_ioctl, .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; /*--- end of kqueue support ----*/ /* * Kernel entry point. * * Initialize/finalize the module and return. * * Return 0 on success, errno on failure. */ static int netmap_loader(__unused struct module *module, int event, __unused void *arg) { int error = 0; switch (event) { case MOD_LOAD: error = netmap_init(); break; case MOD_UNLOAD: /* * if some one is still using netmap, * then the module can not be unloaded. 
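/*
 * Illustrative userspace sketch of the kevent() usage that the kqfilter
 * hook above enables on a netmap file descriptor.  Error handling is
 * omitted; 'netmap_fd' is assumed to be an already opened and
 * NIOCREGIF-registered /dev/netmap descriptor.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
example_wait_rx(int netmap_fd)
{
	struct kevent ev, ret;
	int kq = kqueue();

	/* ask for a notification when RX slots become available */
	EV_SET(&ev, netmap_fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	kevent(kq, &ev, 1, NULL, 0, NULL);

	/* blocks until the adapter's notify path runs KNOTE_UNLOCKED() */
	return kevent(kq, NULL, 0, &ret, 1, NULL);
}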
*/ if (netmap_use_count) { D("netmap module can not be unloaded - netmap_use_count: %d", netmap_use_count); error = EBUSY; break; } netmap_fini(); break; default: error = EOPNOTSUPP; break; } return (error); } #ifdef DEV_MODULE_ORDERED /* * The netmap module contains three drivers: (i) the netmap character device * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI * device driver. The attach() routines of both (ii) and (iii) need the * lock of the global allocator, and such lock is initialized in netmap_init(), * which is part of (i). * Therefore, we make sure that (i) is loaded before (ii) and (iii), using * the 'order' parameter of driver declaration macros. For (i), we specify * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED * macros for (ii) and (iii). */ DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); #else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); #endif /* DEV_MODULE_ORDERED */ MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); /* reduce conditional code */ // linux API, use for the knlist in FreeBSD /* use a private mutex for the knlist */ Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_kern.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_kern.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_kern.h (revision 308054) @@ -1,2091 +1,2090 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo * Copyright (C) 2013-2016 Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * The header contains the definitions of constants and function * prototypes used only in kernelspace. 
*/ #ifndef _NET_NETMAP_KERN_H_ #define _NET_NETMAP_KERN_H_ #if defined(linux) #if defined(CONFIG_NETMAP_VALE) #define WITH_VALE #endif #if defined(CONFIG_NETMAP_PIPE) #define WITH_PIPES #endif #if defined(CONFIG_NETMAP_MONITOR) #define WITH_MONITOR #endif #if defined(CONFIG_NETMAP_GENERIC) #define WITH_GENERIC #endif #if defined(CONFIG_NETMAP_PTNETMAP_GUEST) #define WITH_PTNETMAP_GUEST #endif #if defined(CONFIG_NETMAP_PTNETMAP_HOST) #define WITH_PTNETMAP_HOST #endif #elif defined (_WIN32) #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC #else /* neither linux nor windows */ #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES #define WITH_MONITOR #define WITH_GENERIC #define WITH_PTNETMAP_HOST /* ptnetmap host support */ #define WITH_PTNETMAP_GUEST /* ptnetmap guest support */ #endif #if defined(__FreeBSD__) #include #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #define __user #define NM_LOCK_T struct mtx /* low level spinlock, used to protect queues */ #define NM_MTX_T struct sx /* OS-specific mutex (sleepable) */ #define NM_MTX_INIT(m) sx_init(&(m), #m) #define NM_MTX_DESTROY(m) sx_destroy(&(m)) #define NM_MTX_LOCK(m) sx_xlock(&(m)) #define NM_MTX_UNLOCK(m) sx_xunlock(&(m)) #define NM_MTX_ASSERT(m) sx_assert(&(m), SA_XLOCKED) #define NM_SELINFO_T struct nm_selinfo #define NM_SELRECORD_T struct thread #define MBUF_LEN(m) ((m)->m_pkthdr.len) #define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define MBUF_TRANSMIT(na, ifp, m) ((na)->if_transmit(ifp, m)) #define GEN_TX_MBUF_IFP(m) ((m)->m_pkthdr.rcvif) #define NM_ATOMIC_T volatile int // XXX ? /* atomic operations */ #include #define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) #define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) #if __FreeBSD_version >= 1100030 #define WNA(_ifp) (_ifp)->if_netmap #else /* older FreeBSD */ #define WNA(_ifp) (_ifp)->if_pspare[0] #endif /* older FreeBSD */ #if __FreeBSD_version >= 1100005 struct netmap_adapter *netmap_getna(if_t ifp); #endif #if __FreeBSD_version >= 1100027 #define MBUF_REFCNT(m) ((m)->m_ext.ext_count) #define SET_MBUF_REFCNT(m, x) (m)->m_ext.ext_count = x #else #define MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) #define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x #endif #define MBUF_QUEUED(m) 1 struct nm_selinfo { struct selinfo si; struct mtx m; }; // XXX linux struct, not used in FreeBSD struct net_device_ops { }; struct ethtool_ops { }; struct hrtimer { }; #define NM_BNS_GET(b) #define NM_BNS_PUT(b) #elif defined (linux) #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) #define MBUF_TRANSMIT(na, ifp, m) \ ({ \ /* Avoid infinite recursion with generic. */ \ m->priority = NM_MAGIC_PRIORITY_TX; \ (((struct net_device_ops *)(na)->if_transmit)->ndo_start_xmit(m, ifp)); \ 0; \ }) /* See explanation in nm_os_generic_xmit_frame. 
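/*
 * Illustrative sketch of the portable sleepable-lock idiom provided by
 * the per-platform NM_MTX_* wrappers in this section (sx(9) on FreeBSD,
 * struct mutex on Linux).  'example_lock' is a hypothetical global and
 * NM_MTX_INIT(example_lock) is assumed to have run at init time.
 */
static NM_MTX_T example_lock;

static void
example_update_shared_state(void)
{
	NM_MTX_LOCK(example_lock);
	NM_MTX_ASSERT(example_lock);	/* we hold the lock here */
	/* ... manipulate state shared with other sleepable contexts ... */
	NM_MTX_UNLOCK(example_lock);
}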
*/ #define GEN_TX_MBUF_IFP(m) ((struct ifnet *)skb_shinfo(m)->destructor_arg) #define NM_ATOMIC_T volatile long unsigned int #define NM_MTX_T struct mutex /* OS-specific sleepable lock */ #define NM_MTX_INIT(m) mutex_init(&(m)) #define NM_MTX_DESTROY(m) do { (void)(m); } while (0) #define NM_MTX_LOCK(m) mutex_lock(&(m)) #define NM_MTX_UNLOCK(m) mutex_unlock(&(m)) #define NM_MTX_ASSERT(m) mutex_is_locked(&(m)) #ifndef DEV_NETMAP #define DEV_NETMAP #endif /* DEV_NETMAP */ #elif defined (__APPLE__) #warning apple support is incomplete. #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define NM_LOCK_T IOLock * #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) #elif defined (_WIN32) #include "../../../WINDOWS/win_glue.h" #define NM_SELRECORD_T IO_STACK_LOCATION #define NM_SELINFO_T win_SELINFO // see win_glue.h #define NM_LOCK_T win_spinlock_t // see win_glue.h #define NM_MTX_T KGUARDED_MUTEX /* OS-specific mutex (sleepable) */ #define NM_MTX_INIT(m) KeInitializeGuardedMutex(&m); #define NM_MTX_DESTROY(m) do { (void)(m); } while (0) #define NM_MTX_LOCK(m) KeAcquireGuardedMutex(&(m)) #define NM_MTX_UNLOCK(m) KeReleaseGuardedMutex(&(m)) #define NM_MTX_ASSERT(m) assert(&m.Count>0) //These linknames are for the NDIS driver #define NETMAP_NDIS_LINKNAME_STRING L"\\DosDevices\\NMAPNDIS" #define NETMAP_NDIS_NTDEVICE_STRING L"\\Device\\NMAPNDIS" //Definition of internal driver-to-driver ioctl codes #define NETMAP_KERNEL_XCHANGE_POINTERS _IO('i', 180) #define NETMAP_KERNEL_SEND_SHUTDOWN_SIGNAL _IO_direct('i', 195) //Empty data structures are not permitted by MSVC compiler //XXX_ale, try to solve this problem struct net_device_ops{ char data[1]; }; typedef struct ethtool_ops{ char data[1]; }; typedef struct hrtimer{ KTIMER timer; BOOLEAN active; KDPC deferred_proc; }; /* MSVC does not have likely/unlikely support */ #ifdef _MSC_VER #define likely(x) (x) #define unlikely(x) (x) #else #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #endif //_MSC_VER #else #error unsupported platform #endif /* end - platform-specific code */ #ifndef _WIN32 /* support for emulated sysctl */ #define SYSBEGIN(x) #define SYSEND #endif /* _WIN32 */ #define NM_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x)) #define NMG_LOCK_T NM_MTX_T #define NMG_LOCK_INIT() NM_MTX_INIT(netmap_global_lock) #define NMG_LOCK_DESTROY() NM_MTX_DESTROY(netmap_global_lock) #define NMG_LOCK() NM_MTX_LOCK(netmap_global_lock) #define NMG_UNLOCK() NM_MTX_UNLOCK(netmap_global_lock) #define NMG_LOCK_ASSERT() NM_MTX_ASSERT(netmap_global_lock) #define ND(format, ...) #define D(format, ...) \ do { \ struct timeval __xxts; \ microtime(&__xxts); \ printf("%03d.%06d [%4d] %-25s " format "\n", \ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* rate limited, lps indicates how many per second */ #define RD(lps, format, ...) 
\ do { \ static int t0, __cnt; \ if (t0 != time_second) { \ t0 = time_second; \ __cnt = 0; \ } \ if (__cnt++ < lps) \ D(format, ##__VA_ARGS__); \ } while (0) struct netmap_adapter; struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; /* os-specific NM_SELINFO_T initialzation/destruction functions */ void nm_os_selinfo_init(NM_SELINFO_T *); void nm_os_selinfo_uninit(NM_SELINFO_T *); const char *nm_dump_buf(char *p, int len, int lim, char *dst); void nm_os_selwakeup(NM_SELINFO_T *si); void nm_os_selrecord(NM_SELRECORD_T *sr, NM_SELINFO_T *si); int nm_os_ifnet_init(void); void nm_os_ifnet_fini(void); void nm_os_ifnet_lock(void); void nm_os_ifnet_unlock(void); void nm_os_get_module(void); void nm_os_put_module(void); void netmap_make_zombie(struct ifnet *); void netmap_undo_zombie(struct ifnet *); /* passes a packet up to the host stack. * If the packet is sent (or dropped) immediately it returns NULL, * otherwise it links the packet to prev and returns m. * In this case, a final call with m=NULL and prev != NULL will send up * the entire chain to the host stack. */ void *nm_os_send_up(struct ifnet *, struct mbuf *m, struct mbuf *prev); int nm_os_mbuf_has_offld(struct mbuf *m); #include "netmap_mbq.h" extern NMG_LOCK_T netmap_global_lock; enum txrx { NR_RX = 0, NR_TX = 1, NR_TXRX }; static __inline const char* nm_txrx2str(enum txrx t) { return (t== NR_RX ? "RX" : "TX"); } static __inline enum txrx nm_txrx_swap(enum txrx t) { return (t== NR_RX ? NR_TX : NR_RX); } #define for_rx_tx(t) for ((t) = 0; (t) < NR_TXRX; (t)++) /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. * * nr_hwcur index of the next buffer to refill. * It corresponds to ring->head * at the time the system call returns. * * nr_hwtail index of the first buffer owned by the kernel. * On RX, hwcur->hwtail are receive buffers * not yet released. hwcur is advanced following * ring->head, hwtail is advanced on incoming packets, * and a wakeup is generated when hwtail passes ring->cur * On TX, hwcur->rcur have been filled by the sender * but not sent yet to the NIC; rcur->hwtail are available * for new transmissions, and hwtail->hwcur-1 are pending * transmissions not yet acknowledged. * * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots. * This is so that, on a reset, buffers owned by userspace are not * modified by the kernel. In particular: * RX rings: the next empty buffer (hwtail + hwofs) coincides with * the next empty buffer as known by the hardware (next_to_check or so). * TX rings: hwcur + hwofs coincides with next_to_send * * For received packets, slot->flags is set to nkr_slot_flags * so we can provide a proper initial value (e.g. set NS_FORWARD * when operating in 'transparent' mode). * * The following fields are used to implement lock-free copy of packets * from input to output ports in VALE switch: * nkr_hwlease buffer after the last one being copied. * A writer in nm_bdg_flush reserves N buffers * from nr_hwlease, advances it, then does the * copy outside the lock. * In RX rings (used for VALE ports), * nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1 * In TX rings (used for NIC or host stack ports) * nkr_hwcur <= nkr_hwlease < nkr_hwtail * nkr_leases array of nkr_num_slots where writers can report * completion of their block. NR_NOSLOT (~0) indicates * that the writer has not finished yet * nkr_lease_idx index of next free slot in nr_leases, to be assigned * * The kring is manipulated by txsync/rxsync and generic netmap function. 
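/*
 * Illustrative sketch of the debug macros above: ND() compiles to
 * nothing, D() always prints a timestamped line, RD(lps, ...) is rate
 * limited to 'lps' lines per second.  example_debug_path() is a
 * hypothetical caller.
 */
static void
example_debug_path(int ring_id, u_int dropped)
{
	ND("ring %d polled", ring_id);			/* compiled out */
	D("ring %d entering netmap mode", ring_id);	/* always printed */
	if (dropped)
		RD(5, "ring %d: %u drops", ring_id, dropped); /* at most 5 lines/s */
}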
* * Concurrent rxsync or txsync on the same ring are prevented through * by nm_kr_(try)lock() which in turn uses nr_busy. This is all we need * for NIC rings, and for TX rings attached to the host stack. * * RX rings attached to the host stack use an mbq (rx_queue) on both * rxsync_from_host() and netmap_transmit(). The mbq is protected * by its internal lock. * * RX rings attached to the VALE switch are accessed by both senders * and receiver. They are protected through the q_lock on the RX ring. */ struct netmap_kring { struct netmap_ring *ring; uint32_t nr_hwcur; uint32_t nr_hwtail; /* * Copies of values in user rings, so we do not need to look * at the ring (which could be modified). These are set in the * *sync_prologue()/finalize() routines. */ uint32_t rhead; uint32_t rcur; uint32_t rtail; uint32_t nr_kflags; /* private driver flags */ #define NKR_PENDINTR 0x1 // Pending interrupt. #define NKR_EXCLUSIVE 0x2 /* exclusive binding */ #define NKR_FORWARD 0x4 /* (host ring only) there are packets to forward */ #define NKR_NEEDRING 0x8 /* ring needed even if users==0 * (used internally by pipes and * by ptnetmap host ports) */ uint32_t nr_mode; uint32_t nr_pending_mode; #define NKR_NETMAP_OFF 0x0 #define NKR_NETMAP_ON 0x1 uint32_t nkr_num_slots; /* * On a NIC reset, the NIC ring indexes may be reset but the * indexes in the netmap rings remain the same. nkr_hwofs * keeps track of the offset between the two. */ int32_t nkr_hwofs; uint16_t nkr_slot_flags; /* initial value for flags */ /* last_reclaim is opaque marker to help reduce the frequency * of operations such as reclaiming tx buffers. A possible use * is set it to ticks and do the reclaim only once per tick. */ uint64_t last_reclaim; NM_SELINFO_T si; /* poll/select wait queue */ NM_LOCK_T q_lock; /* protects kring and ring. */ NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ struct netmap_adapter *na; /* The following fields are for VALE switch support */ struct nm_bdg_fwd *nkr_ft; uint32_t *nkr_leases; #define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */ uint32_t nkr_hwlease; uint32_t nkr_lease_idx; /* while nkr_stopped is set, no new [tr]xsync operations can * be started on this kring. * This is used by netmap_disable_all_rings() * to find a synchronization point where critical data * structures pointed to by the kring can be added or removed */ volatile int nkr_stopped; /* Support for adapters without native netmap support. * On tx rings we preallocate an array of tx buffers * (same size as the netmap ring), on rx rings we * store incoming mbufs in a queue that is drained by * a rxsync. */ struct mbuf **tx_pool; struct mbuf *tx_event; /* TX event used as a notification */ NM_LOCK_T tx_event_lock; /* protects the tx_event mbuf */ struct mbq rx_queue; /* intercepted rx mbufs. */ uint32_t users; /* existing bindings for this ring */ uint32_t ring_id; /* kring identifier */ enum txrx tx; /* kind of ring (tx or rx) */ char name[64]; /* diagnostic */ /* [tx]sync callback for this kring. * The default nm_kring_create callback (netmap_krings_create) * sets the nm_sync callback of each hardware tx(rx) kring to * the corresponding nm_txsync(nm_rxsync) taken from the * netmap_adapter; moreover, it sets the sync callback * of the host tx(rx) ring to netmap_txsync_to_host * (netmap_rxsync_from_host). * * Overrides: the above configuration is not changed by * any of the nm_krings_create callbacks. 
*/ int (*nm_sync)(struct netmap_kring *kring, int flags); int (*nm_notify)(struct netmap_kring *kring, int flags); #ifdef WITH_PIPES struct netmap_kring *pipe; /* if this is a pipe ring, * pointer to the other end */ #endif /* WITH_PIPES */ #ifdef WITH_VALE int (*save_notify)(struct netmap_kring *kring, int flags); #endif #ifdef WITH_MONITOR /* array of krings that are monitoring this kring */ struct netmap_kring **monitors; uint32_t max_monitors; /* current size of the monitors array */ uint32_t n_monitors; /* next unused entry in the monitor array */ /* * Monitors work by intercepting the sync and notify callbacks of the * monitored krings. This is implemented by replacing the pointers * above and saving the previous ones in mon_* pointers below */ int (*mon_sync)(struct netmap_kring *kring, int flags); int (*mon_notify)(struct netmap_kring *kring, int flags); uint32_t mon_tail; /* last seen slot on rx */ uint32_t mon_pos; /* index of this ring in the monitored ring array */ #endif } #ifdef _WIN32 __declspec(align(64)); #else __attribute__((__aligned__(64))); #endif /* return 1 iff the kring needs to be turned on */ static inline int nm_kring_pending_on(struct netmap_kring *kring) { return kring->nr_pending_mode == NKR_NETMAP_ON && kring->nr_mode == NKR_NETMAP_OFF; } /* return 1 iff the kring needs to be turned off */ static inline int nm_kring_pending_off(struct netmap_kring *kring) { return kring->nr_pending_mode == NKR_NETMAP_OFF && kring->nr_mode == NKR_NETMAP_ON; } /* return the next index, with wraparound */ static inline uint32_t nm_next(uint32_t i, uint32_t lim) { return unlikely (i == lim) ? 0 : i + 1; } /* return the previous index, with wraparound */ static inline uint32_t nm_prev(uint32_t i, uint32_t lim) { return unlikely (i == 0) ? lim : i - 1; } /* * * Here is the layout for the Rx and Tx rings. RxRING TxRING +-----------------+ +-----------------+ | | | | |XXX free slot XXX| |XXX free slot XXX| +-----------------+ +-----------------+ head->| owned by user |<-hwcur | not sent to nic |<-hwcur | | | yet | +-----------------+ | | cur->| available to | | | | user, not read | +-----------------+ | yet | cur->| (being | | | | prepared) | | | | | +-----------------+ + ------ + tail->| |<-hwtail | |<-hwlease | (being | ... | | ... | prepared) | ... | | ... +-----------------+ ... | | ... | |<-hwlease +-----------------+ | | tail->| |<-hwtail | | | | | | | | | | | | +-----------------+ +-----------------+ * The cur/tail (user view) and hwcur/hwtail (kernel view) * are used in the normal operation of the card. * * When a ring is the output of a switch port (Rx ring for * a VALE port, Tx ring for the host stack or NIC), slots * are reserved in blocks through 'hwlease' which points * to the next unused slot. * On an Rx ring, hwlease is always after hwtail, * and completions cause hwtail to advance. * On a Tx ring, hwlease is always between cur and hwtail, * and completions cause cur to advance. * * nm_kr_space() returns the maximum number of slots that * can be assigned. * nm_kr_lease() reserves the required number of buffers, * advances nkr_hwlease and also returns an entry in * a circular array where completions should be reported. */ struct netmap_lut { struct lut_entry *lut; uint32_t objtotal; /* max buffer index */ uint32_t objsize; /* buffer size */ }; struct netmap_vp_adapter; // forward /* * The "struct netmap_adapter" extends the "struct adapter" * (or equivalent) device descriptor. * It contains all base fields needed to support netmap operation. 
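/*
 * Illustrative sketch of the index arithmetic described above: a
 * driver-style RX completion loop that advances nr_hwtail with
 * nm_next().  The example_hw_* helpers are hypothetical stand-ins for
 * the hardware-specific descriptor checks; real drivers also translate
 * NIC indexes with netmap_idx_n2k() and bound the loop.
 */
static int example_hw_rx_ready(struct netmap_kring *kring, u_int i);
static u_int example_hw_rx_len(struct netmap_kring *kring, u_int i);

static void
example_rx_advance(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int nm_i = kring->nr_hwtail;	/* first slot owned by the kernel */

	while (example_hw_rx_ready(kring, nm_i)) {
		ring->slot[nm_i].len = example_hw_rx_len(kring, nm_i);
		ring->slot[nm_i].flags = kring->nkr_slot_flags;
		nm_i = nm_next(nm_i, lim);	/* wraparound handled here */
	}
	kring->nr_hwtail = nm_i;	/* expose the new buffers to userspace */
}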
* There are in fact different types of netmap adapters * (native, generic, VALE switch...) so a netmap_adapter is * just the first field in the derived type. */ struct netmap_adapter { /* * On linux we do not have a good way to tell if an interface * is netmap-capable. So we always use the following trick: * NA(ifp) points here, and the first entry (which hopefully * always exists and is at least 32 bits) contains a magic * value which we can use to detect that the interface is good. */ uint32_t magic; uint32_t na_flags; /* enabled, and other flags */ #define NAF_SKIP_INTR 1 /* use the regular interrupt handler. * useful during initialization */ #define NAF_SW_ONLY 2 /* forward packets only to sw adapter */ #define NAF_BDG_MAYSLEEP 4 /* the bridge is allowed to sleep when * forwarding packets coming from this * interface */ #define NAF_MEM_OWNER 8 /* the adapter uses its own memory area * that cannot be changed */ #define NAF_NATIVE 16 /* the adapter is native. * Virtual ports (non persistent vale ports, * pipes, monitors...) should never use * this flag. */ #define NAF_NETMAP_ON 32 /* netmap is active (either native or * emulated). Where possible (e.g. FreeBSD) * IFCAP_NETMAP also mirrors this flag. */ #define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ #define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */ #define NAF_PTNETMAP_HOST 256 /* the adapter supports ptnetmap in the host */ #define NAF_ZOMBIE (1U<<30) /* the nic driver has been unloaded */ #define NAF_BUSY (1U<<31) /* the adapter is used internally and * cannot be registered from userspace */ int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ u_int num_tx_desc; /* number of descriptor in each queue */ u_int num_rx_desc; /* tx_rings and rx_rings are private but allocated * as a contiguous chunk of memory. Each array has * N+1 entries, for the adapter queues and for the host queue. */ struct netmap_kring *tx_rings; /* array of TX rings. */ struct netmap_kring *rx_rings; /* array of RX rings. */ void *tailroom; /* space below the rings array */ /* (used for leases) */ NM_SELINFO_T si[NR_TXRX]; /* global wait queues */ /* count users of the global wait queues */ int si_users[NR_TXRX]; void *pdev; /* used to store pci device */ /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ int (*if_transmit)(struct ifnet *, struct mbuf *); /* copy of if_input for netmap_send_up() */ void (*if_input)(struct ifnet *, struct mbuf *); /* references to the ifnet and device routines, used by * the generic netmap functions. */ struct ifnet *ifp; /* adapter is ifp->if_softc */ /*---- callbacks for this netmap adapter -----*/ /* * nm_dtor() is the cleanup routine called when destroying * the adapter. * Called with NMG_LOCK held. * * nm_register() is called on NIOCREGIF and close() to enter * or exit netmap mode on the NIC * Called with NNG_LOCK held. * * nm_txsync() pushes packets to the underlying hw/switch * * nm_rxsync() collects packets from the underlying hw/switch * * nm_config() returns configuration information from the OS * Called with NMG_LOCK held. * * nm_krings_create() create and init the tx_rings and * rx_rings arrays of kring structures. In particular, * set the nm_sync callbacks for each ring. 
* There is no need to also allocate the corresponding * netmap_rings, since netmap_mem_rings_create() will always * be called to provide the missing ones. * Called with NNG_LOCK held. * * nm_krings_delete() cleanup and delete the tx_rings and rx_rings * arrays * Called with NMG_LOCK held. * * nm_notify() is used to act after data have become available * (or the stopped state of the ring has changed) * For hw devices this is typically a selwakeup(), * but for NIC/host ports attached to a switch (or vice-versa) * we also need to invoke the 'txsync' code downstream. * This callback pointer is actually used only to initialize * kring->nm_notify. * Return values are the same as for netmap_rx_irq(). */ void (*nm_dtor)(struct netmap_adapter *); int (*nm_register)(struct netmap_adapter *, int onoff); void (*nm_intr)(struct netmap_adapter *, int onoff); int (*nm_txsync)(struct netmap_kring *kring, int flags); int (*nm_rxsync)(struct netmap_kring *kring, int flags); int (*nm_notify)(struct netmap_kring *kring, int flags); #define NAF_FORCE_READ 1 #define NAF_FORCE_RECLAIM 2 /* return configuration information */ int (*nm_config)(struct netmap_adapter *, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd); int (*nm_krings_create)(struct netmap_adapter *); void (*nm_krings_delete)(struct netmap_adapter *); #ifdef WITH_VALE /* * nm_bdg_attach() initializes the na_vp field to point * to an adapter that can be attached to a VALE switch. If the * current adapter is already a VALE port, na_vp is simply a cast; * otherwise, na_vp points to a netmap_bwrap_adapter. * If applicable, this callback also initializes na_hostvp, * that can be used to connect the adapter host rings to the * switch. * Called with NMG_LOCK held. * * nm_bdg_ctl() is called on the actual attach/detach to/from * to/from the switch, to perform adapter-specific * initializations * Called with NMG_LOCK held. */ int (*nm_bdg_attach)(const char *bdg_name, struct netmap_adapter *); int (*nm_bdg_ctl)(struct netmap_adapter *, struct nmreq *, int); /* adapter used to attach this adapter to a VALE switch (if any) */ struct netmap_vp_adapter *na_vp; /* adapter used to attach the host rings of this adapter * to a VALE switch (if any) */ struct netmap_vp_adapter *na_hostvp; #endif /* standard refcount to control the lifetime of the adapter * (it should be equal to the lifetime of the corresponding ifp) */ int na_refcount; /* memory allocator (opaque) * We also cache a pointer to the lut_entry for translating * buffer addresses, the total number of buffers and the buffer size. */ struct netmap_mem_d *nm_mem; struct netmap_lut na_lut; /* additional information attached to this adapter * by other netmap subsystems. Currently used by * bwrap, LINUX/v1000 and ptnetmap */ void *na_private; /* array of pipes that have this adapter as a parent */ struct netmap_pipe_adapter **na_pipes; int na_next_pipe; /* next free slot in the array */ int na_max_pipes; /* size of the array */ /* Offset of ethernet header for each packet. */ u_int virt_hdr_len; char name[64]; }; static __inline u_int nma_get_ndesc(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->num_tx_desc : na->num_rx_desc); } static __inline void nma_set_ndesc(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_tx_desc = v; else na->num_rx_desc = v; } static __inline u_int nma_get_nrings(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? 
na->num_tx_rings : na->num_rx_rings); } static __inline void nma_set_nrings(struct netmap_adapter *na, enum txrx t, u_int v) { if (t == NR_TX) na->num_tx_rings = v; else na->num_rx_rings = v; } static __inline struct netmap_kring* NMR(struct netmap_adapter *na, enum txrx t) { return (t == NR_TX ? na->tx_rings : na->rx_rings); } /* * If the NIC is owned by the kernel * (i.e., bridge), neither another bridge nor user can use it; * if the NIC is owned by a user, only users can share it. * Evaluation must be done under NMG_LOCK(). */ #define NETMAP_OWNED_BY_KERN(na) ((na)->na_flags & NAF_BUSY) #define NETMAP_OWNED_BY_ANY(na) \ (NETMAP_OWNED_BY_KERN(na) || ((na)->active_fds > 0)) /* * derived netmap adapters for various types of ports */ struct netmap_vp_adapter { /* VALE software port */ struct netmap_adapter up; /* * Bridge support: * * bdg_port is the port number used in the bridge; * na_bdg points to the bridge this NA is attached to. */ int bdg_port; struct nm_bridge *na_bdg; int retry; /* Maximum Frame Size, used in bdg_mismatch_datapath() */ u_int mfs; /* Last source MAC on this port */ uint64_t last_smac; }; struct netmap_hw_adapter { /* physical device */ struct netmap_adapter up; struct net_device_ops nm_ndo; // XXX linux only struct ethtool_ops nm_eto; // XXX linux only const struct ethtool_ops* save_ethtool; int (*nm_hw_register)(struct netmap_adapter *, int onoff); }; #ifdef WITH_GENERIC /* Mitigation support. */ struct nm_generic_mit { struct hrtimer mit_timer; int mit_pending; int mit_ring_idx; /* index of the ring being mitigated */ struct netmap_adapter *mit_na; /* backpointer */ }; struct netmap_generic_adapter { /* emulated device */ struct netmap_hw_adapter up; /* Pointer to a previously used netmap adapter. */ struct netmap_adapter *prev; /* generic netmap adapters support: * a net_device_ops struct overrides ndo_select_queue(), * save_if_input saves the if_input hook (FreeBSD), * mit implements rx interrupt mitigation, */ struct net_device_ops generic_ndo; void (*save_if_input)(struct ifnet *, struct mbuf *); struct nm_generic_mit *mit; #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif /* Is the adapter able to use multiple RX slots to scatter * each packet pushed up by the driver? */ int rxsg; /* Is the transmission path controlled by a netmap-aware * device queue (i.e. qdisc on linux)? */ int txqdisc; }; #endif /* WITH_GENERIC */ static __inline int netmap_real_rings(struct netmap_adapter *na, enum txrx t) { return nma_get_nrings(na, t) + !!(na->na_flags & NAF_HOST_RINGS); } #ifdef WITH_VALE struct nm_bdg_polling_state; /* * Bridge wrapper for non VALE ports attached to a VALE switch. * * The real device must already have its own netmap adapter (hwna). * The bridge wrapper and the hwna adapter share the same set of * netmap rings and buffers, but they have two separate sets of * krings descriptors, with tx/rx meanings swapped: * * netmap * bwrap krings rings krings hwna * +------+ +------+ +-----+ +------+ +------+ * |tx_rings->| |\ /| |----| |<-tx_rings| * | | +------+ \ / +-----+ +------+ | | * | | X | | * | | / \ | | * | | +------+/ \+-----+ +------+ | | * |rx_rings->| | | |----| |<-rx_rings| * | | +------+ +-----+ +------+ | | * +------+ +------+ * * - packets coming from the bridge go to the brwap rx rings, * which are also the hwna tx rings. The bwrap notify callback * will then complete the hwna tx (see netmap_bwrap_notify). * * - packets coming from the outside go to the hwna rx rings, * which are also the bwrap tx rings. 
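/*
 * Illustrative sketch of how a NIC driver registers with netmap using
 * the adapter structure and callbacks documented above; this is the
 * usual pattern around netmap_attach() (declared further down in this
 * header).  The example_* callbacks are hypothetical.
 */
static int example_netmap_txsync(struct netmap_kring *kring, int flags);
static int example_netmap_rxsync(struct netmap_kring *kring, int flags);
static int example_netmap_reg(struct netmap_adapter *na, int onoff);

static void
example_netmap_attach(struct ifnet *ifp, u_int nqueues, u_int ndesc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = ifp;
	na.num_tx_desc = na.num_rx_desc = ndesc;
	na.num_tx_rings = na.num_rx_rings = nqueues;
	na.nm_txsync = example_netmap_txsync;
	na.nm_rxsync = example_netmap_rxsync;
	na.nm_register = example_netmap_reg;
	netmap_attach(&na);	/* marks ifp as netmap capable */
}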
The (overwritten) hwna * notify method will then complete the bridge tx * (see netmap_bwrap_intr_notify). * * The bridge wrapper may optionally connect the hwna 'host' rings * to the bridge. This is done by using a second port in the * bridge and connecting it to the 'host' netmap_vp_adapter * contained in the netmap_bwrap_adapter. The brwap host adapter * cross-links the hwna host rings in the same way as shown above. * * - packets coming from the bridge and directed to the host stack * are handled by the bwrap host notify callback * (see netmap_bwrap_host_notify) * * - packets coming from the host stack are still handled by the * overwritten hwna notify callback (netmap_bwrap_intr_notify), * but are diverted to the host adapter depending on the ring number. * */ struct netmap_bwrap_adapter { struct netmap_vp_adapter up; struct netmap_vp_adapter host; /* for host rings */ struct netmap_adapter *hwna; /* the underlying device */ /* * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need * a place to store the n_detmap_priv_d data structure. * This is only done when physical interfaces * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; struct nm_bdg_polling_state *na_polling_state; }; int netmap_bwrap_attach(const char *name, struct netmap_adapter *); #endif /* WITH_VALE */ #ifdef WITH_PIPES #define NM_MAXPIPES 64 /* max number of pipes per adapter */ struct netmap_pipe_adapter { struct netmap_adapter up; u_int id; /* pipe identifier */ int role; /* either NR_REG_PIPE_MASTER or NR_REG_PIPE_SLAVE */ struct netmap_adapter *parent; /* adapter that owns the memory */ struct netmap_pipe_adapter *peer; /* the other end of the pipe */ int peer_ref; /* 1 iff we are holding a ref to the peer */ u_int parent_slot; /* index in the parent pipe array */ }; #endif /* WITH_PIPES */ /* return slots reserved to rx clients; used in drivers */ static inline uint32_t nm_kr_rxspace(struct netmap_kring *k) { int space = k->nr_hwtail - k->nr_hwcur; if (space < 0) space += k->nkr_num_slots; ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); return space; } /* return slots reserved to tx clients */ #define nm_kr_txspace(_k) nm_kr_rxspace(_k) /* True if no space in the tx ring, only valid after txsync_prologue */ static inline int nm_kr_txempty(struct netmap_kring *kring) { return kring->rcur == kring->nr_hwtail; } /* True if no more completed slots in the rx ring, only valid after * rxsync_prologue */ #define nm_kr_rxempty(_k) nm_kr_txempty(_k) /* * protect against multiple threads using the same ring. * also check that the ring has not been stopped or locked */ #define NM_KR_BUSY 1 /* some other thread is syncing the ring */ #define NM_KR_STOPPED 2 /* unbounded stop (ifconfig down or driver unload) */ #define NM_KR_LOCKED 3 /* bounded, brief stop for mutual exclusion */ /* release the previously acquired right to use the *sync() methods of the ring */ static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } /* true if the ifp that backed the adapter has disappeared (e.g., the * driver has been unloaded) */ static inline int nm_iszombie(struct netmap_adapter *na); /* try to obtain exclusive right to issue the *sync() operations on the ring. * The right is obtained and must be later relinquished via nm_kr_put() if and * only if nm_kr_tryget() returns 0. 
* If can_sleep is 1 there are only two other possible outcomes: * - the function returns NM_KR_BUSY * - the function returns NM_KR_STOPPED and sets the POLLERR bit in *perr * (if non-null) * In both cases the caller will typically skip the ring, possibly collecting * errors along the way. * If the calling context does not allow sleeping, the caller must pass 0 in can_sleep. * In the latter case, the function may also return NM_KR_LOCKED and leave *perr * untouched: ideally, the caller should try again at a later time. */ static __inline int nm_kr_tryget(struct netmap_kring *kr, int can_sleep, int *perr) { int busy = 1, stopped; /* check a first time without taking the lock * to avoid starvation for nm_kr_get() */ retry: stopped = kr->nkr_stopped; if (unlikely(stopped)) { goto stop; } busy = NM_ATOMIC_TEST_AND_SET(&kr->nr_busy); /* we should not return NM_KR_BUSY if the ring was * actually stopped, so check another time after * the barrier provided by the atomic operation */ stopped = kr->nkr_stopped; if (unlikely(stopped)) { goto stop; } if (unlikely(nm_iszombie(kr->na))) { stopped = NM_KR_STOPPED; goto stop; } return unlikely(busy) ? NM_KR_BUSY : 0; stop: if (!busy) nm_kr_put(kr); if (stopped == NM_KR_STOPPED) { /* if POLLERR is defined we want to use it to simplify netmap_poll(). * Otherwise, any non-zero value will do. */ #ifdef POLLERR #define NM_POLLERR POLLERR #else #define NM_POLLERR 1 #endif /* POLLERR */ if (perr) *perr |= NM_POLLERR; #undef NM_POLLERR } else if (can_sleep) { tsleep(kr, 0, "NM_KR_TRYGET", 4); goto retry; } return stopped; } /* put the ring in the 'stopped' state and wait for the current user (if any) to * notice. stopped must be either NM_KR_STOPPED or NM_KR_LOCKED */ static __inline void nm_kr_stop(struct netmap_kring *kr, int stopped) { kr->nkr_stopped = stopped; while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } /* restart a ring after a stop */ static __inline void nm_kr_start(struct netmap_kring *kr) { kr->nkr_stopped = 0; nm_kr_put(kr); } /* * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the * struct netmap_ring's and the struct selinfo. * * netmap_detach() frees the memory allocated by netmap_attach(). * * netmap_transmit() replaces the if_transmit routine of the interface, * and is used to intercept packets coming from the stack. * * netmap_load_map/netmap_reload_map are helper routines to set/reset * the dmamap for a packet buffer * * netmap_reset() is a helper routine to be called in the hw driver * when reinitializing a ring. It should not be called by * virtual ports (vale, pipes, monitor) */ int netmap_attach(struct netmap_adapter *); void netmap_detach(struct ifnet *); int netmap_transmit(struct ifnet *, struct mbuf *); struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); /* Return codes for netmap_*x_irq. */ enum { /* Driver should do normal interrupt processing, e.g. because * the interface is not in netmap mode. */ NM_IRQ_PASS = 0, /* Port is in netmap mode, and the interrupt work has been * completed. The driver does not have to notify netmap * again before the next interrupt. */ NM_IRQ_COMPLETED = -1, /* Port is in netmap mode, but the interrupt work has not been * completed. The driver has to make sure netmap will be * notified again soon, even if no more interrupts come (e.g. 
* on Linux the driver should not call napi_complete()). */ NM_IRQ_RESCHED = -2, }; /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) int netmap_common_irq(struct netmap_adapter *, u_int, u_int *work_done); #ifdef WITH_VALE /* functions used by external modules to interface with VALE */ #define netmap_vp_to_ifp(_vp) ((_vp)->up.ifp) #define netmap_ifp_to_vp(_ifp) (NA(_ifp)->na_vp) #define netmap_ifp_to_host_vp(_ifp) (NA(_ifp)->na_hostvp) #define netmap_bdg_idx(_vp) ((_vp)->bdg_port) const char *netmap_bdg_name(struct netmap_vp_adapter *); #else /* !WITH_VALE */ #define netmap_vp_to_ifp(_vp) NULL #define netmap_ifp_to_vp(_ifp) NULL #define netmap_ifp_to_host_vp(_ifp) NULL #define netmap_bdg_idx(_vp) -1 #define netmap_bdg_name(_vp) NULL #endif /* WITH_VALE */ static inline int nm_netmap_on(struct netmap_adapter *na) { return na && na->na_flags & NAF_NETMAP_ON; } static inline int nm_native_on(struct netmap_adapter *na) { return nm_netmap_on(na) && (na->na_flags & NAF_NATIVE); } static inline int nm_iszombie(struct netmap_adapter *na) { return na == NULL || (na->na_flags & NAF_ZOMBIE); } static inline void nm_update_hostrings_mode(struct netmap_adapter *na) { /* Process nr_mode and nr_pending_mode for host rings. */ na->tx_rings[na->num_tx_rings].nr_mode = na->tx_rings[na->num_tx_rings].nr_pending_mode; na->rx_rings[na->num_rx_rings].nr_mode = na->rx_rings[na->num_rx_rings].nr_pending_mode; } /* set/clear native flags and if_transmit/netdev_ops */ static inline void nm_set_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; /* We do the setup for intercepting packets only if we are the * first user of this adapapter. */ if (na->active_fds > 0) { return; } na->na_flags |= NAF_NETMAP_ON; #ifdef IFCAP_NETMAP /* or FreeBSD ? */ ifp->if_capenable |= IFCAP_NETMAP; #endif #if defined (__FreeBSD__) na->if_transmit = ifp->if_transmit; ifp->if_transmit = netmap_transmit; #elif defined (_WIN32) (void)ifp; /* prevent a warning */ //XXX_ale can we just comment those? //na->if_transmit = ifp->if_transmit; //ifp->if_transmit = netmap_transmit; #else na->if_transmit = (void *)ifp->netdev_ops; ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; ((struct netmap_hw_adapter *)na)->save_ethtool = ifp->ethtool_ops; ifp->ethtool_ops = &((struct netmap_hw_adapter*)na)->nm_eto; #endif nm_update_hostrings_mode(na); } static inline void nm_clear_native_flags(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; /* We undo the setup for intercepting packets only if we are the * last user of this adapapter. */ if (na->active_fds > 0) { return; } nm_update_hostrings_mode(na); #if defined(__FreeBSD__) ifp->if_transmit = na->if_transmit; #elif defined(_WIN32) (void)ifp; /* prevent a warning */ //XXX_ale can we just comment those? //ifp->if_transmit = na->if_transmit; #else ifp->netdev_ops = (void *)na->if_transmit; ifp->ethtool_ops = ((struct netmap_hw_adapter*)na)->save_ethtool; #endif na->na_flags &= ~NAF_NETMAP_ON; #ifdef IFCAP_NETMAP /* or FreeBSD ? */ ifp->if_capenable &= ~IFCAP_NETMAP; #endif } /* * nm_*sync_prologue() functions are used in ioctl/poll and ptnetmap * kthreads. * We need netmap_ring* parameter, because in ptnetmap it is decoupled * from host kring. * The user-space ring pointers (head/cur/tail) are shared through * CSB between host and guest. */ /* * validates parameters in the ring/kring, returns a value for head * If any error, returns ring_size to force a reinit. 
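/*
 * Illustrative sketch of the interrupt-side contract described by the
 * NM_IRQ_* codes above.  example_rx_intr() and
 * example_schedule_rx_task() are hypothetical driver routines.
 */
static void example_schedule_rx_task(struct ifnet *ifp, u_int ring_nr);

static void
example_rx_intr(struct ifnet *ifp, u_int ring_nr)
{
	u_int work_done = 0;

	switch (netmap_rx_irq(ifp, ring_nr, &work_done)) {
	case NM_IRQ_COMPLETED:
		return;		/* netmap consumed the interrupt */
	case NM_IRQ_RESCHED:
		/* make sure netmap gets notified again soon */
		example_schedule_rx_task(ifp, ring_nr);
		return;
	case NM_IRQ_PASS:
	default:
		break;
	}
	/* ... normal (non-netmap) receive processing ... */
}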
*/ uint32_t nm_txsync_prologue(struct netmap_kring *, struct netmap_ring *); /* * validates parameters in the ring/kring, returns a value for head * If any error, returns ring_size lim to force a reinit. */ uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *); /* check/fix address and len in tx rings */ #if 1 /* debug version */ #define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ if (_a == NETMAP_BUF_BASE(_na) || _l > NETMAP_BUF_SIZE(_na)) { \ RD(5, "bad addr/len ring %d slot %d idx %d len %d", \ kring->ring_id, nm_i, slot->buf_idx, len); \ if (_l > NETMAP_BUF_SIZE(_na)) \ _l = NETMAP_BUF_SIZE(_na); \ } } while (0) #else /* no debug version */ #define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ if (_l > NETMAP_BUF_SIZE(_na)) \ _l = NETMAP_BUF_SIZE(_na); \ } while (0) #endif /*---------------------------------------------------------------*/ /* * Support routines used by netmap subsystems * (native drivers, VALE, generic, pipes, monitors, ...) */ /* common routine for all functions that create a netmap adapter. It performs * two main tasks: * - if the na points to an ifp, mark the ifp as netmap capable * using na as its native adapter; * - provide defaults for the setup callbacks and the memory allocator */ int netmap_attach_common(struct netmap_adapter *); /* common actions to be performed on netmap adapter destruction */ void netmap_detach_common(struct netmap_adapter *); /* fill priv->np_[tr]xq{first,last} using the ringid and flags information * coming from a struct nmreq */ int netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags); /* update the ring parameters (number and size of tx and rx rings). * It calls the nm_config callback, if available. */ int netmap_update_config(struct netmap_adapter *na); /* create and initialize the common fields of the krings array. * using the information that must be already available in the na. * tailroom can be used to request the allocation of additional * tailroom bytes after the krings array. This is used by * netmap_vp_adapter's (i.e., VALE ports) to make room for * leasing-related data structures */ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); /* deletes the kring array of the adapter. The array must have * been created using netmap_krings_create */ void netmap_krings_delete(struct netmap_adapter *na); int netmap_hw_krings_create(struct netmap_adapter *na); void netmap_hw_krings_delete(struct netmap_adapter *na); /* set the stopped/enabled status of ring * When stopping, they also wait for all current activity on the ring to * terminate. The status change is then notified using the na nm_notify * callback. */ void netmap_set_ring(struct netmap_adapter *, u_int ring_id, enum txrx, int stopped); /* set the stopped/enabled status of all rings of the adapter. 
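/*
 * Illustrative sketch of the example_netmap_txsync() callback mentioned
 * earlier, using the helpers above: the prologue has already validated
 * the user indexes, so the driver walks nr_hwcur..rhead, sanity-checks
 * each buffer with NM_CHECK_ADDR_LEN() and hands it to the hardware.
 * example_hw_queue_tx() is hypothetical; PNMB() is declared further
 * down in this header.
 */
static void example_hw_queue_tx(struct netmap_kring *kring, u_int i,
    uint64_t paddr, u_int len);

static int
example_netmap_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;	/* validated by the prologue */
	u_int nm_i = kring->nr_hwcur;

	while (nm_i != head) {
		struct netmap_slot *slot = &ring->slot[nm_i];
		u_int len = slot->len;
		uint64_t paddr;
		void *addr = PNMB(na, slot, &paddr);

		NM_CHECK_ADDR_LEN(na, addr, len);	/* clamp bogus addr/len */
		example_hw_queue_tx(kring, nm_i, paddr, len);
		nm_i = nm_next(nm_i, lim);
	}
	kring->nr_hwcur = head;
	/* a complete driver also reclaims completed buffers here and
	 * advances kring->nr_hwtail accordingly */
	return 0;
}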
*/ void netmap_set_all_rings(struct netmap_adapter *, int stopped); /* convenience wrappers for netmap_set_all_rings */ void netmap_disable_all_rings(struct ifnet *); void netmap_enable_all_rings(struct ifnet *); int netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint16_t ringid, uint32_t flags); void netmap_do_unregif(struct netmap_priv_d *priv); u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, struct ifnet **ifp, int create); void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); #ifdef WITH_VALE /* * The following bridge-related functions are used by other * kernel modules. * * VALE only supports unicast or broadcast. The lookup * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports, * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown. * XXX in practice "unknown" might be handled same as broadcast. */ typedef u_int (*bdg_lookup_fn_t)(struct nm_bdg_fwd *ft, uint8_t *ring_nr, struct netmap_vp_adapter *); typedef int (*bdg_config_fn_t)(struct nm_ifreq *); typedef void (*bdg_dtor_fn_t)(const struct netmap_vp_adapter *); struct netmap_bdg_ops { bdg_lookup_fn_t lookup; bdg_config_fn_t config; bdg_dtor_fn_t dtor; }; u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, struct netmap_vp_adapter *); #define NM_BRIDGES 8 /* number of bridges */ #define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS #define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1) /* these are redefined in case of no VALE support */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); struct nm_bridge *netmap_init_bridges2(u_int); void netmap_uninit_bridges2(struct nm_bridge *, u_int); int netmap_init_bridges(void); void netmap_uninit_bridges(void); int netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops); int netmap_bdg_config(struct nmreq *nmr); #else /* !WITH_VALE */ #define netmap_get_bdg_na(_1, _2, _3) 0 #define netmap_init_bridges(_1) 0 #define netmap_uninit_bridges() #define netmap_bdg_ctl(_1, _2) EINVAL #endif /* !WITH_VALE */ #ifdef WITH_PIPES /* max number of pipes per device */ #define NM_MAXPIPES 64 /* XXX how many? */ void netmap_pipe_dealloc(struct netmap_adapter *); int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create); #else /* !WITH_PIPES */ #define NM_MAXPIPES 0 #define netmap_pipe_alloc(_1, _2) 0 #define netmap_pipe_dealloc(_1) #define netmap_get_pipe_na(nmr, _2, _3) \ ({ int role__ = (nmr)->nr_flags & NR_REG_MASK; \ (role__ == NR_REG_PIPE_MASTER || \ role__ == NR_REG_PIPE_SLAVE) ? EOPNOTSUPP : 0; }) #endif #ifdef WITH_MONITOR int netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create); void netmap_monitor_stop(struct netmap_adapter *na); #else #define netmap_get_monitor_na(nmr, _2, _3) \ ((nmr)->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX) ? 
EOPNOTSUPP : 0) #endif #ifdef CONFIG_NET_NS struct net *netmap_bns_get(void); void netmap_bns_put(struct net *); void netmap_bns_getbridges(struct nm_bridge **, u_int *); #else #define netmap_bns_get() #define netmap_bns_put(_1) #define netmap_bns_getbridges(b, n) \ do { *b = nm_bridges; *n = NM_BRIDGES; } while (0) #endif /* Various prototypes */ int netmap_poll(struct netmap_priv_d *, int events, NM_SELRECORD_T *td); int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); void netmap_dtor(void *data); int netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *); /* netmap_adapter creation/destruction */ // #define NM_DEBUG_PUTGET 1 #ifdef NM_DEBUG_PUTGET #define NM_DBG(f) __##f void __netmap_adapter_get(struct netmap_adapter *na); #define netmap_adapter_get(na) \ do { \ struct netmap_adapter *__na = na; \ D("getting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_get(__na); \ } while (0) int __netmap_adapter_put(struct netmap_adapter *na); #define netmap_adapter_put(na) \ ({ \ struct netmap_adapter *__na = na; \ D("putting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_put(__na); \ }) #else /* !NM_DEBUG_PUTGET */ #define NM_DBG(f) f void netmap_adapter_get(struct netmap_adapter *na); int netmap_adapter_put(struct netmap_adapter *na); #endif /* !NM_DEBUG_PUTGET */ /* * module variables */ #define NETMAP_BUF_BASE(_na) ((_na)->na_lut.lut[0].vaddr) #define NETMAP_BUF_SIZE(_na) ((_na)->na_lut.objsize) extern int netmap_no_pendintr; extern int netmap_mitigate; extern int netmap_verbose; /* for debugging */ enum { /* verbose flags */ NM_VERB_ON = 1, /* generic verbose */ NM_VERB_HOST = 0x2, /* verbose host stack */ NM_VERB_RXSYNC = 0x10, /* verbose on rxsync/txsync */ NM_VERB_TXSYNC = 0x20, NM_VERB_RXINTR = 0x100, /* verbose on rx/tx intr (driver) */ NM_VERB_TXINTR = 0x200, NM_VERB_NIC_RXSYNC = 0x1000, /* verbose on rx/tx intr (driver) */ NM_VERB_NIC_TXSYNC = 0x2000, }; extern int netmap_txsync_retry; extern int netmap_flags; extern int netmap_generic_mit; extern int netmap_generic_ringsize; extern int netmap_generic_rings; extern int netmap_generic_txqdisc; /* * NA returns a pointer to the struct netmap adapter from the ifp, * WNA is used to write it. */ #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) /* * On old versions of FreeBSD, NA(ifp) is a pspare. On linux we * overload another pointer in the netdev. * * We check if NA(ifp) is set and its first element has a related * magic value. The capenable is within the struct netmap_adapter. */ #define NETMAP_MAGIC 0x52697a7a #define NM_NA_VALID(ifp) (NA(ifp) && \ ((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC ) #define NM_ATTACH_NA(ifp, na) do { \ WNA(ifp) = na; \ if (NA(ifp)) \ NA(ifp)->magic = \ ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC; \ } while(0) #define NM_IS_NATIVE(ifp) (NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor) #if defined(__FreeBSD__) /* Assigns the device IOMMU domain to an allocator. * Returns -ENOMEM in case the domain is different */ #define nm_iommu_group_id(dev) (0) /* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { } /* bus_dmamap_load wrapper: call aforementioned function if map != NULL. * XXX can we do it without a callback ? 
*/ static inline void netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map) { if (map) bus_dmamap_unload(tag, map); } /* update the map when a buffer changes. */ static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) { bus_dmamap_unload(tag, map); bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } #elif defined(_WIN32) #else /* linux */ int nm_iommu_group_id(bus_dma_tag_t dev); #include static inline void netmap_load_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (0 && map) { *map = dma_map_single(na->pdev, buf, NETMAP_BUF_SIZE(na), DMA_BIDIRECTIONAL); } } static inline void netmap_unload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map) { u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, DMA_BIDIRECTIONAL); } } static inline void netmap_reload_map(struct netmap_adapter *na, bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { u_int sz = NETMAP_BUF_SIZE(na); if (*map) { dma_unmap_single(na->pdev, *map, sz, DMA_BIDIRECTIONAL); } *map = dma_map_single(na->pdev, buf, sz, DMA_BIDIRECTIONAL); } /* * XXX How do we redefine these functions: * * on linux we need * dma_map_single(&pdev->dev, virt_addr, len, direction) * dma_unmap_single(&adapter->pdev->dev, phys_addr, len, direction * The len can be implicit (on netmap it is NETMAP_BUF_SIZE) * unfortunately the direction is not, so we need to change * something to have a cross API */ #if 0 struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l]; /* set time_stamp *before* dma to help avoid a possible race */ buffer_info->time_stamp = jiffies; buffer_info->mapped_as_page = false; buffer_info->length = len; //buffer_info->next_to_watch = l; /* reload dma map */ dma_unmap_single(&adapter->pdev->dev, buffer_info->dma, NETMAP_BUF_SIZE, DMA_TO_DEVICE); buffer_info->dma = dma_map_single(&adapter->pdev->dev, addr, NETMAP_BUF_SIZE, DMA_TO_DEVICE); if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { D("dma mapping error"); /* goto dma_error; See e1000_put_txbuf() */ /* XXX reset */ } tx_desc->buffer_addr = htole64(buffer_info->dma); //XXX #endif /* * The bus_dmamap_sync() can be one of wmb() or rmb() depending on direction. */ #define bus_dmamap_sync(_a, _b, _c) #endif /* linux */ /* * functions to map NIC to KRING indexes (n2k) and vice versa (k2n) */ static inline int netmap_idx_n2k(struct netmap_kring *kr, int idx) { int n = kr->nkr_num_slots; idx += kr->nkr_hwofs; if (idx < 0) return idx + n; else if (idx < n) return idx; else return idx - n; } static inline int netmap_idx_k2n(struct netmap_kring *kr, int idx) { int n = kr->nkr_num_slots; idx -= kr->nkr_hwofs; if (idx < 0) return idx + n; else if (idx < n) return idx; else return idx - n; } /* Entries of the look-up table. */ struct lut_entry { void *vaddr; /* virtual address. */ vm_paddr_t paddr; /* physical address. */ }; struct netmap_obj_pool; /* * NMB return the virtual address of a buffer (buffer 0 on bad index) * PNMB also fills the physical address */ static inline void * NMB(struct netmap_adapter *na, struct netmap_slot *slot) { struct lut_entry *lut = na->na_lut.lut; uint32_t i = slot->buf_idx; return (unlikely(i >= na->na_lut.objtotal)) ? 
lut[0].vaddr : lut[i].vaddr; } static inline void * PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp) { uint32_t i = slot->buf_idx; struct lut_entry *lut = na->na_lut.lut; void *ret = (i >= na->na_lut.objtotal) ? lut[0].vaddr : lut[i].vaddr; #ifndef _WIN32 *pp = (i >= na->na_lut.objtotal) ? lut[0].paddr : lut[i].paddr; #else *pp = (i >= na->na_lut.objtotal) ? (uint64_t)lut[0].paddr.QuadPart : (uint64_t)lut[i].paddr.QuadPart; #endif return ret; } /* * Structure associated to each netmap file descriptor. * It is created on open and left unbound (np_nifp == NULL). * A successful NIOCREGIF will set np_nifp and the first few fields; * this is protected by a global lock (NMG_LOCK) due to low contention. * * np_refs counts the number of references to the structure: one for the fd, * plus (on FreeBSD) one for each active mmap which we track ourselves * (linux automatically tracks them, but FreeBSD does not). * np_refs is protected by NMG_LOCK. * * Read access to the structure is lock free, because ni_nifp once set * can only go to 0 when nobody is using the entry anymore. Readers * must check that np_nifp != NULL before using the other fields. */ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; struct ifnet *np_ifp; uint32_t np_flags; /* from the ioctl */ u_int np_qfirst[NR_TXRX], np_qlast[NR_TXRX]; /* range of tx/rx rings to scan */ uint16_t np_txpoll; /* XXX and also np_rxpoll ? */ int np_refs; /* use with NMG_LOCK held */ /* pointers to the selinfo to be used for selrecord. * Either the local or the global one depending on the * number of rings. */ NM_SELINFO_T *np_si[NR_TXRX]; struct thread *np_td; /* kqueue, just debugging */ }; struct netmap_priv_d *netmap_priv_new(void); void netmap_priv_delete(struct netmap_priv_d *); static inline int nm_kring_pending(struct netmap_priv_d *np) { struct netmap_adapter *na = np->np_na; enum txrx t; int i; for_rx_tx(t) { for (i = np->np_qfirst[t]; i < np->np_qlast[t]; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; if (kring->nr_mode != kring->nr_pending_mode) { return 1; } } } return 0; } #ifdef WITH_MONITOR struct netmap_monitor_adapter { struct netmap_adapter up; struct netmap_priv_d priv; uint32_t flags; }; #endif /* WITH_MONITOR */ #ifdef WITH_GENERIC /* * generic netmap emulation for devices that do not have * native netmap support. */ int generic_netmap_attach(struct ifnet *ifp); int generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept); int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept); /* * the generic transmit routine is passed a structure to optionally * build a queue of descriptors, in an OS-specific way. * The payload is at addr, if non-null, and the routine should send or queue * the packet, returning 0 if successful, 1 on failure. * * At the end, if head is non-null, there will be an additional call * to the function with addr = NULL; this should tell the OS-specific * routine to send the queue and free any resources. Failure is ignored. 
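/*
 * Illustrative sketch of ring (re)initialization in a NIC driver,
 * combining netmap_reset(), netmap_idx_n2k(), PNMB() and the dma
 * helpers above.  example_hw_write_rxdesc(), 'tag' and 'maps' stand in
 * for the driver's own descriptor and busdma state.
 */
static void example_hw_write_rxdesc(struct netmap_adapter *na, u_int ring_nr,
    u_int i, uint64_t paddr);

static void
example_rx_ring_init(struct netmap_adapter *na, u_int ring_nr,
    bus_dma_tag_t tag, bus_dmamap_t *maps)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
	u_int i;

	if (slot == NULL)	/* ring not in netmap mode, nothing to do */
		return;
	for (i = 0; i < na->num_rx_desc; i++) {
		int si = netmap_idx_n2k(kring, i);	/* NIC -> kring index */
		uint64_t paddr;
		void *addr = PNMB(na, slot + si, &paddr);

		example_hw_write_rxdesc(na, ring_nr, i, paddr);
		netmap_load_map(na, tag, maps[i], addr);
	}
}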
*/ struct nm_os_gen_arg { struct ifnet *ifp; void *m; /* os-specific mbuf-like object */ void *head, *tail; /* tailq, if the OS-specific routine needs to build one */ void *addr; /* payload of current packet */ u_int len; /* packet length */ u_int ring_nr; /* ring number */ u_int qevent; /* in txqdisc mode, place an event on this mbuf */ }; int nm_os_generic_xmit_frame(struct nm_os_gen_arg *); int nm_os_generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); void nm_os_generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); void nm_os_generic_set_features(struct netmap_generic_adapter *gna); static inline struct ifnet* netmap_generic_getifp(struct netmap_generic_adapter *gna) { if (gna->prev) return gna->prev->ifp; return gna->up.up.ifp; } void netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done); //#define RATE_GENERIC /* Enables communication statistics for generic. */ #ifdef RATE_GENERIC void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); #else #define generic_rate(txp, txs, txi, rxp, rxs, rxi) #endif /* * netmap_mitigation API. This is used by the generic adapter * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. */ void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na); void nm_os_mitigation_start(struct nm_generic_mit *mit); void nm_os_mitigation_restart(struct nm_generic_mit *mit); int nm_os_mitigation_active(struct nm_generic_mit *mit); void nm_os_mitigation_cleanup(struct nm_generic_mit *mit); #else /* !WITH_GENERIC */ #define generic_netmap_attach(ifp) (EOPNOTSUPP) #endif /* WITH_GENERIC */ /* Shared declarations for the VALE switch. */ /* * Each transmit queue accumulates a batch of packets into * a structure before forwarding. Packets to the same * destination are put in a list using ft_next as a link field. * ft_frags and ft_next are valid only on the first fragment. */ struct nm_bdg_fwd { /* forwarding entry for a bridge */ void *ft_buf; /* netmap or indirect buffer */ uint8_t ft_frags; /* how many fragments (only on 1st frag) */ uint8_t _ft_port; /* dst port (unused) */ uint16_t ft_flags; /* flags, e.g. indirect */ uint16_t ft_len; /* src fragment len */ uint16_t ft_next; /* next packet to same destination */ }; /* struct 'virtio_net_hdr' from linux. */ struct nm_vnet_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ uint8_t flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ uint8_t gso_type; uint16_t hdr_len; uint16_t gso_size; uint16_t csum_start; uint16_t csum_offset; }; #define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */ /* Private definitions for IPv4, IPv6, UDP and TCP headers. */ struct nm_iphdr { uint8_t version_ihl; uint8_t tos; uint16_t tot_len; uint16_t id; uint16_t frag_off; uint8_t ttl; uint8_t protocol; uint16_t check; uint32_t saddr; uint32_t daddr; /* The options start here.
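 * (The header length in bytes is 4 * (version_ihl & 0x0f): a plain
 * 20-byte header without options has version_ihl == 0x45.)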
*/ }; struct nm_tcphdr { uint16_t source; uint16_t dest; uint32_t seq; uint32_t ack_seq; uint8_t doff; /* Data offset + Reserved */ uint8_t flags; uint16_t window; uint16_t check; uint16_t urg_ptr; }; struct nm_udphdr { uint16_t source; uint16_t dest; uint16_t len; uint16_t check; }; struct nm_ipv6hdr { uint8_t priority_version; uint8_t flow_lbl[3]; uint16_t payload_len; uint8_t nexthdr; uint8_t hop_limit; uint8_t saddr[16]; uint8_t daddr[16]; }; /* Type used to store a checksum (in host byte order) that hasn't been * folded yet. */ #define rawsum_t uint32_t rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph); void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check); void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check); uint16_t nm_os_csum_fold(rawsum_t cur_sum); void bdg_mismatch_datapath(struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na, const struct nm_bdg_fwd *ft_p, struct netmap_ring *dst_ring, u_int *j, u_int lim, u_int *howmany); /* persistent virtual port routines */ int nm_os_vi_persist(const char *, struct ifnet **); void nm_os_vi_detach(struct ifnet *); void nm_os_vi_init_index(void); /* * kernel thread routines */ struct nm_kthread; /* OS-specific kthread - opaque */ typedef void (*nm_kthread_worker_fn_t)(void *data); /* kthread configuration */ struct nm_kthread_cfg { long type; /* kthread type/identifier */ - struct ptnet_ring_cfg event; /* event/ioctl fd */ nm_kthread_worker_fn_t worker_fn; /* worker function */ void *worker_private;/* worker parameter */ int attach_user; /* attach kthread to user process */ }; /* kthread configuration */ -struct nm_kthread *nm_os_kthread_create(struct nm_kthread_cfg *cfg); +struct nm_kthread *nm_os_kthread_create(struct nm_kthread_cfg *cfg, + unsigned int cfgtype, + void *opaque); int nm_os_kthread_start(struct nm_kthread *); void nm_os_kthread_stop(struct nm_kthread *); void nm_os_kthread_delete(struct nm_kthread *); void nm_os_kthread_wakeup_worker(struct nm_kthread *nmk); void nm_os_kthread_send_irq(struct nm_kthread *); void nm_os_kthread_set_affinity(struct nm_kthread *, int); u_int nm_os_ncpus(void); #ifdef WITH_PTNETMAP_HOST /* * netmap adapter for host ptnetmap ports */ struct netmap_pt_host_adapter { struct netmap_adapter up; struct netmap_adapter *parent; int (*parent_nm_notify)(struct netmap_kring *kring, int flags); void *ptns; }; /* ptnetmap HOST routines */ int netmap_get_pt_host_na(struct nmreq *nmr, struct netmap_adapter **na, int create); int ptnetmap_ctl(struct nmreq *nmr, struct netmap_adapter *na); static inline int nm_ptnetmap_host_on(struct netmap_adapter *na) { return na && na->na_flags & NAF_PTNETMAP_HOST; } #else /* !WITH_PTNETMAP_HOST */ #define netmap_get_pt_host_na(nmr, _2, _3) \ ((nmr)->nr_flags & (NR_PTNETMAP_HOST) ? EOPNOTSUPP : 0) #define ptnetmap_ctl(_1, _2) EINVAL #define nm_ptnetmap_host_on(_1) EINVAL #endif /* !WITH_PTNETMAP_HOST */ #ifdef WITH_PTNETMAP_GUEST /* ptnetmap GUEST routines */ -typedef uint32_t (*nm_pt_guest_ptctl_t)(struct ifnet *, uint32_t); - /* * netmap adapter for guest ptnetmap ports */ struct netmap_pt_guest_adapter { /* The netmap adapter to be used by netmap applications. * This field must be the first, to allow upcast. */ struct netmap_hw_adapter hwup; /* The netmap adapter to be used by the driver. 
*/ struct netmap_hw_adapter dr; void *csb; /* Reference counter to track users of backend netmap port: the * network stack and netmap clients. * Used to decide when we need (de)allocate krings/rings and * start (stop) ptnetmap kthreads. */ int backend_regifs; }; -int netmap_pt_guest_attach(struct netmap_adapter *, void *, - unsigned int, nm_pt_guest_ptctl_t); +int netmap_pt_guest_attach(struct netmap_adapter *na, void *csb, + unsigned int nifp_offset, unsigned int memid); struct ptnet_ring; bool netmap_pt_guest_txsync(struct ptnet_ring *ptring, struct netmap_kring *kring, int flags); bool netmap_pt_guest_rxsync(struct ptnet_ring *ptring, struct netmap_kring *kring, int flags); int ptnet_nm_krings_create(struct netmap_adapter *na); void ptnet_nm_krings_delete(struct netmap_adapter *na); void ptnet_nm_dtor(struct netmap_adapter *na); #endif /* WITH_PTNETMAP_GUEST */ #endif /* _NET_NETMAP_KERN_H_ */ Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_mem2.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_mem2.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_mem2.c (revision 308054) @@ -1,2458 +1,2399 @@ /* * Copyright (C) 2012-2014 Matteo Landi * Copyright (C) 2012-2016 Luigi Rizzo * Copyright (C) 2012-2016 Giuseppe Lettieri * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifdef linux #include "bsd_glue.h" #endif /* linux */ #ifdef __APPLE__ #include "osx_glue.h" #endif /* __APPLE__ */ #ifdef __FreeBSD__ #include /* prerequisite */ __FBSDID("$FreeBSD$"); #include #include #include /* MALLOC_DEFINE */ #include #include /* vtophys */ #include /* vtophys */ #include /* sockaddrs */ #include #include #include #include #include #include /* bus_dmamap_* */ /* M_NETMAP only used in here */ MALLOC_DECLARE(M_NETMAP); MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); #endif /* __FreeBSD__ */ #ifdef _WIN32 #include #endif #include #include #include #include "netmap_mem2.h" #ifdef _WIN32_USE_SMALL_GENERIC_DEVICES_MEMORY #define NETMAP_BUF_MAX_NUM 8*4096 /* if too big takes too much time to allocate */ #else #define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ #endif #define NETMAP_POOL_MAX_NAMSZ 32 enum { NETMAP_IF_POOL = 0, NETMAP_RING_POOL, NETMAP_BUF_POOL, NETMAP_POOLS_NR }; struct netmap_obj_params { u_int size; u_int num; }; struct netmap_obj_pool { char name[NETMAP_POOL_MAX_NAMSZ]; /* name of the allocator */ /* ---------------------------------------------------*/ /* these are only meaningful if the pool is finalized */ /* (see 'finalized' field in netmap_mem_d) */ u_int objtotal; /* actual total number of objects. */ u_int memtotal; /* actual total memory space */ u_int numclusters; /* actual number of clusters */ u_int objfree; /* number of free objects. */ struct lut_entry *lut; /* virt,phys addresses, objtotal entries */ uint32_t *bitmap; /* one bit per buffer, 1 means free */ uint32_t bitmap_slots; /* number of uint32 entries in bitmap */ /* ---------------------------------------------------*/ /* limits */ u_int objminsize; /* minimum object size */ u_int objmaxsize; /* maximum object size */ u_int nummin; /* minimum number of objects */ u_int nummax; /* maximum number of objects */ /* these are changed only by config */ u_int _objtotal; /* total number of objects */ u_int _objsize; /* object size */ u_int _clustsize; /* cluster size */ u_int _clustentries; /* objects per cluster */ u_int _numclusters; /* number of clusters */ /* requested values */ u_int r_objtotal; u_int r_objsize; }; #define NMA_LOCK_T NM_MTX_T struct netmap_mem_ops { int (*nmd_get_lut)(struct netmap_mem_d *, struct netmap_lut*); int (*nmd_get_info)(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); vm_paddr_t (*nmd_ofstophys)(struct netmap_mem_d *, vm_ooffset_t); int (*nmd_config)(struct netmap_mem_d *); int (*nmd_finalize)(struct netmap_mem_d *); void (*nmd_deref)(struct netmap_mem_d *); ssize_t (*nmd_if_offset)(struct netmap_mem_d *, const void *vaddr); void (*nmd_delete)(struct netmap_mem_d *); struct netmap_if * (*nmd_if_new)(struct netmap_adapter *); void (*nmd_if_delete)(struct netmap_adapter *, struct netmap_if *); int (*nmd_rings_create)(struct netmap_adapter *); void (*nmd_rings_delete)(struct netmap_adapter *); }; typedef uint16_t nm_memid_t; -/* - * Shared info for netmap allocator - * - * Each allocator contains this structur as first netmap_if. - * In this way, we can share same details about allocator - * to the VM. - * Used in ptnetmap. 
- */ -struct netmap_mem_shared_info { -#ifndef _WIN32 - struct netmap_if up; /* ends with a 0-sized array, which VSC does not like */ -#else /* !_WIN32 */ - char up[sizeof(struct netmap_if)]; -#endif /* !_WIN32 */ - uint64_t features; -#define NMS_FEAT_BUF_POOL 0x0001 -#define NMS_FEAT_MEMSIZE 0x0002 - - uint32_t buf_pool_offset; - uint32_t buf_pool_objtotal; - uint32_t buf_pool_objsize; - uint32_t totalsize; -}; - -#define NMS_NAME "nms_info" -#define NMS_VERSION 1 -static const struct netmap_if nms_if_blueprint = { - .ni_name = NMS_NAME, - .ni_version = NMS_VERSION, - .ni_tx_rings = 0, - .ni_rx_rings = 0 -}; - struct netmap_mem_d { NMA_LOCK_T nm_mtx; /* protect the allocator */ u_int nm_totalsize; /* shorthand */ u_int flags; #define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */ int lasterr; /* last error for curr config */ int active; /* active users */ int refcount; /* the three allocators */ struct netmap_obj_pool pools[NETMAP_POOLS_NR]; nm_memid_t nm_id; /* allocator identifier */ int nm_grp; /* iommu groupd id */ /* list of all existing allocators, sorted by nm_id */ struct netmap_mem_d *prev, *next; struct netmap_mem_ops *ops; }; /* * XXX need to fix the case of t0 == void */ #define NMD_DEFCB(t0, name) \ t0 \ netmap_mem_##name(struct netmap_mem_d *nmd) \ { \ return nmd->ops->nmd_##name(nmd); \ } #define NMD_DEFCB1(t0, name, t1) \ t0 \ netmap_mem_##name(struct netmap_mem_d *nmd, t1 a1) \ { \ return nmd->ops->nmd_##name(nmd, a1); \ } #define NMD_DEFCB3(t0, name, t1, t2, t3) \ t0 \ netmap_mem_##name(struct netmap_mem_d *nmd, t1 a1, t2 a2, t3 a3) \ { \ return nmd->ops->nmd_##name(nmd, a1, a2, a3); \ } #define NMD_DEFNACB(t0, name) \ t0 \ netmap_mem_##name(struct netmap_adapter *na) \ { \ return na->nm_mem->ops->nmd_##name(na); \ } #define NMD_DEFNACB1(t0, name, t1) \ t0 \ netmap_mem_##name(struct netmap_adapter *na, t1 a1) \ { \ return na->nm_mem->ops->nmd_##name(na, a1); \ } NMD_DEFCB1(int, get_lut, struct netmap_lut *); NMD_DEFCB3(int, get_info, u_int *, u_int *, uint16_t *); NMD_DEFCB1(vm_paddr_t, ofstophys, vm_ooffset_t); static int netmap_mem_config(struct netmap_mem_d *); NMD_DEFCB(int, config); NMD_DEFCB1(ssize_t, if_offset, const void *); NMD_DEFCB(void, delete); NMD_DEFNACB(struct netmap_if *, if_new); NMD_DEFNACB1(void, if_delete, struct netmap_if *); NMD_DEFNACB(int, rings_create); NMD_DEFNACB(void, rings_delete); static int netmap_mem_map(struct netmap_obj_pool *, struct netmap_adapter *); static int netmap_mem_unmap(struct netmap_obj_pool *, struct netmap_adapter *); static int nm_mem_assign_group(struct netmap_mem_d *, struct device *); #define NMA_LOCK_INIT(n) NM_MTX_INIT((n)->nm_mtx) #define NMA_LOCK_DESTROY(n) NM_MTX_DESTROY((n)->nm_mtx) #define NMA_LOCK(n) NM_MTX_LOCK((n)->nm_mtx) #define NMA_UNLOCK(n) NM_MTX_UNLOCK((n)->nm_mtx) #ifdef NM_DEBUG_MEM_PUTGET #define NM_DBG_REFC(nmd, func, line) \ printf("%s:%d mem[%d] -> %d\n", func, line, (nmd)->nm_id, (nmd)->refcount); #else #define NM_DBG_REFC(nmd, func, line) #endif #ifdef NM_DEBUG_MEM_PUTGET void __netmap_mem_get(struct netmap_mem_d *nmd, const char *func, int line) #else void netmap_mem_get(struct netmap_mem_d *nmd) #endif { NMA_LOCK(nmd); nmd->refcount++; NM_DBG_REFC(nmd, func, line); NMA_UNLOCK(nmd); } #ifdef NM_DEBUG_MEM_PUTGET void __netmap_mem_put(struct netmap_mem_d *nmd, const char *func, int line) #else void netmap_mem_put(struct netmap_mem_d *nmd) #endif { int last; NMA_LOCK(nmd); last = (--nmd->refcount == 0); NM_DBG_REFC(nmd, func, line); NMA_UNLOCK(nmd); if (last) netmap_mem_delete(nmd); } int 
netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) { if (nm_mem_assign_group(nmd, na->pdev) < 0) { return ENOMEM; } else { NMA_LOCK(nmd); nmd->lasterr = nmd->ops->nmd_finalize(nmd); NMA_UNLOCK(nmd); } if (!nmd->lasterr && na->pdev) netmap_mem_map(&nmd->pools[NETMAP_BUF_POOL], na); return nmd->lasterr; } -static int netmap_mem_init_shared_info(struct netmap_mem_d *nmd); - void netmap_mem_deref(struct netmap_mem_d *nmd, struct netmap_adapter *na) { NMA_LOCK(nmd); netmap_mem_unmap(&nmd->pools[NETMAP_BUF_POOL], na); if (nmd->active == 1) { u_int i; /* * Reset the allocator when it falls out of use so that any * pool resources leaked by unclean application exits are * reclaimed. */ for (i = 0; i < NETMAP_POOLS_NR; i++) { struct netmap_obj_pool *p; u_int j; p = &nmd->pools[i]; p->objfree = p->objtotal; /* * Reproduce the net effect of the M_ZERO malloc() * and marking of free entries in the bitmap that * occur in finalize_obj_allocator() */ memset(p->bitmap, '\0', sizeof(uint32_t) * ((p->objtotal + 31) / 32)); /* * Set all the bits in the bitmap that have * corresponding buffers to 1 to indicate they are * free. */ for (j = 0; j < p->objtotal; j++) { if (p->lut[j].vaddr != NULL) { p->bitmap[ (j>>5) ] |= ( 1 << (j & 31) ); } } } /* * Per netmap_mem_finalize_all(), * buffers 0 and 1 are reserved */ nmd->pools[NETMAP_BUF_POOL].objfree -= 2; if (nmd->pools[NETMAP_BUF_POOL].bitmap) { /* XXX This check is a workaround that prevents a * NULL pointer crash which currently happens only - * with ptnetmap guests. Also, - * netmap_mem_init_shared_info must not be called - * by ptnetmap guest. */ + * with ptnetmap guests. + * Removed shared-info --> is the bug still there? */ nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; - - /* expose info to the ptnetmap guest */ - netmap_mem_init_shared_info(nmd); } } nmd->ops->nmd_deref(nmd); NMA_UNLOCK(nmd); } /* accessor functions */ static int netmap_mem2_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) { lut->lut = nmd->pools[NETMAP_BUF_POOL].lut; lut->objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; lut->objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; return 0; } static struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, .num = 100, }, [NETMAP_RING_POOL] = { .size = 9*PAGE_SIZE, .num = 200, }, [NETMAP_BUF_POOL] = { .size = 2048, .num = NETMAP_BUF_MAX_NUM, }, }; static struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { .size = 1024, .num = 2, }, [NETMAP_RING_POOL] = { .size = 5*PAGE_SIZE, .num = 4, }, [NETMAP_BUF_POOL] = { .size = 2048, .num = 4098, }, }; /* * nm_mem is the memory allocator used for all physical interfaces * running in netmap mode. * Virtual (VALE) ports will have each its own allocator. */ extern struct netmap_mem_ops netmap_mem_global_ops; /* forward */ struct netmap_mem_d nm_mem = { /* Our memory allocator. */ .pools = { [NETMAP_IF_POOL] = { .name = "netmap_if", .objminsize = sizeof(struct netmap_if), .objmaxsize = 4096, .nummin = 10, /* don't be stingy */ .nummax = 10000, /* XXX very large */ }, [NETMAP_RING_POOL] = { .name = "netmap_ring", .objminsize = sizeof(struct netmap_ring), .objmaxsize = 32*PAGE_SIZE, .nummin = 2, .nummax = 1024, }, [NETMAP_BUF_POOL] = { .name = "netmap_buf", .objminsize = 64, .objmaxsize = 65536, .nummin = 4, .nummax = 1000000, /* one million! 
*/ }, }, .nm_id = 1, .nm_grp = -1, .prev = &nm_mem, .next = &nm_mem, .ops = &netmap_mem_global_ops }; static struct netmap_mem_d *netmap_last_mem_d = &nm_mem; /* blueprint for the private memory allocators */ extern struct netmap_mem_ops netmap_mem_private_ops; /* forward */ /* XXX clang is not happy about using name as a print format */ static const struct netmap_mem_d nm_blueprint = { .pools = { [NETMAP_IF_POOL] = { .name = "%s_if", .objminsize = sizeof(struct netmap_if), .objmaxsize = 4096, .nummin = 1, .nummax = 100, }, [NETMAP_RING_POOL] = { .name = "%s_ring", .objminsize = sizeof(struct netmap_ring), .objmaxsize = 32*PAGE_SIZE, .nummin = 2, .nummax = 1024, }, [NETMAP_BUF_POOL] = { .name = "%s_buf", .objminsize = 64, .objmaxsize = 65536, .nummin = 4, .nummax = 1000000, /* one million! */ }, }, .flags = NETMAP_MEM_PRIVATE, .ops = &netmap_mem_private_ops }; /* memory allocator related sysctls */ #define STRINGIFY(x) #x #define DECLARE_SYSCTLS(id, name) \ SYSBEGIN(mem2_ ## name); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_size, \ CTLFLAG_RW, &netmap_min_priv_params[id].size, 0, \ "Default size of private netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ "Default number of private netmap " STRINGIFY(name) "s"); \ SYSEND SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); /* call with NMA_LOCK(&nm_mem) held */ static int nm_mem_assign_id_locked(struct netmap_mem_d *nmd) { nm_memid_t id; struct netmap_mem_d *scan = netmap_last_mem_d; int error = ENOMEM; do { /* we rely on unsigned wrap around */ id = scan->nm_id + 1; if (id == 0) /* reserve 0 as error value */ id = 1; scan = scan->next; if (id != scan->nm_id) { nmd->nm_id = id; nmd->prev = scan->prev; nmd->next = scan; scan->prev->next = nmd; scan->prev = nmd; netmap_last_mem_d = nmd; error = 0; break; } } while (scan != netmap_last_mem_d); return error; } /* call with NMA_LOCK(&nm_mem) *not* held */ static int nm_mem_assign_id(struct netmap_mem_d *nmd) { int ret; NMA_LOCK(&nm_mem); ret = nm_mem_assign_id_locked(nmd); NMA_UNLOCK(&nm_mem); return ret; } static void nm_mem_release_id(struct netmap_mem_d *nmd) { NMA_LOCK(&nm_mem); nmd->prev->next = nmd->next; nmd->next->prev = nmd->prev; if (netmap_last_mem_d == nmd) netmap_last_mem_d = nmd->prev; nmd->prev = nmd->next = NULL; NMA_UNLOCK(&nm_mem); } static int nm_mem_assign_group(struct netmap_mem_d *nmd, struct device *dev) { int err = 0, id; id = nm_iommu_group_id(dev); if (netmap_verbose) D("iommu_group %d", id); NMA_LOCK(nmd); if (nmd->nm_grp < 0) nmd->nm_grp = id; if (nmd->nm_grp != id) nmd->lasterr = err = ENOMEM; NMA_UNLOCK(nmd); return err; } /* * First, find the allocator that contains the requested offset, * then locate the cluster through a lookup table. 
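 *
 * Worked example (the numbers are illustrative, not the defaults):
 * with an IF pool of 100 KiB and a RING pool of 200 KiB, an offset of
 * 310 KiB falls in the BUF pool at relative offset 10 KiB; with 2 KiB
 * buffers that selects lut[5], so the result is vtophys(lut[5].vaddr)
 * plus a zero remainder.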
*/ static vm_paddr_t netmap_mem2_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) { int i; vm_ooffset_t o = offset; vm_paddr_t pa; struct netmap_obj_pool *p; NMA_LOCK(nmd); p = nmd->pools; for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i].memtotal, i++) { if (offset >= p[i].memtotal) continue; // now lookup the cluster's address #ifndef _WIN32 pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr) + offset % p[i]._objsize; #else pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr); pa.QuadPart += offset % p[i]._objsize; #endif NMA_UNLOCK(nmd); return pa; } /* this is only in case of errors */ D("invalid ofs 0x%x out of 0x%x 0x%x 0x%x", (u_int)o, p[NETMAP_IF_POOL].memtotal, p[NETMAP_IF_POOL].memtotal + p[NETMAP_RING_POOL].memtotal, p[NETMAP_IF_POOL].memtotal + p[NETMAP_RING_POOL].memtotal + p[NETMAP_BUF_POOL].memtotal); NMA_UNLOCK(nmd); #ifndef _WIN32 return 0; // XXX bad address #else vm_paddr_t res; res.QuadPart = 0; return res; #endif } #ifdef _WIN32 /* * win32_build_virtual_memory_for_userspace * * This function get all the object making part of the pools and maps * a contiguous virtual memory space for the userspace * It works this way * 1 - allocate a Memory Descriptor List wide as the sum * of the memory needed for the pools * 2 - cycle all the objects in every pool and for every object do * * 2a - cycle all the objects in every pool, get the list * of the physical address descriptors * 2b - calculate the offset in the array of pages desciptor in the * main MDL * 2c - copy the descriptors of the object in the main MDL * * 3 - return the resulting MDL that needs to be mapped in userland * * In this way we will have an MDL that describes all the memory for the * objects in a single object */ PMDL win32_build_user_vm_map(struct netmap_mem_d* nmd) { int i, j; u_int memsize, memflags, ofs = 0; PMDL mainMdl, tempMdl; if (netmap_mem_get_info(nmd, &memsize, &memflags, NULL)) { D("memory not finalised yet"); return NULL; } mainMdl = IoAllocateMdl(NULL, memsize, FALSE, FALSE, NULL); if (mainMdl == NULL) { D("failed to allocate mdl"); return NULL; } NMA_LOCK(nmd); for (i = 0; i < NETMAP_POOLS_NR; i++) { struct netmap_obj_pool *p = &nmd->pools[i]; int clsz = p->_clustsize; int clobjs = p->_clustentries; /* objects per cluster */ int mdl_len = sizeof(PFN_NUMBER) * BYTES_TO_PAGES(clsz); PPFN_NUMBER pSrc, pDst; /* each pool has a different cluster size so we need to reallocate */ tempMdl = IoAllocateMdl(p->lut[0].vaddr, clsz, FALSE, FALSE, NULL); if (tempMdl == NULL) { NMA_UNLOCK(nmd); D("fail to allocate tempMdl"); IoFreeMdl(mainMdl); return NULL; } pSrc = MmGetMdlPfnArray(tempMdl); /* create one entry per cluster, the lut[] has one entry per object */ for (j = 0; j < p->numclusters; j++, ofs += clsz) { pDst = &MmGetMdlPfnArray(mainMdl)[BYTES_TO_PAGES(ofs)]; MmInitializeMdl(tempMdl, p->lut[j*clobjs].vaddr, clsz); MmBuildMdlForNonPagedPool(tempMdl); /* compute physical page addresses */ RtlCopyMemory(pDst, pSrc, mdl_len); /* copy the page descriptors */ mainMdl->MdlFlags = tempMdl->MdlFlags; /* XXX what is in here ? */ } IoFreeMdl(tempMdl); } NMA_UNLOCK(nmd); return mainMdl; } #endif /* _WIN32 */ /* * helper function for OS-specific mmap routines (currently only windows). * Given an nmd and a pool index, returns the cluster size and number of clusters. * Returns 0 if memory is finalised and the pool is valid, otherwise 1. * It should be called under NMA_LOCK(nmd) otherwise the underlying info can change. 
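 *
 * A hypothetical caller (sketch only) would walk the pools like this:
 *
 *	u_int pool, clustsize, numclusters;
 *	NMA_LOCK(nmd);
 *	for (pool = 0; pool < NETMAP_POOLS_NR; pool++) {
 *		if (netmap_mem2_get_pool_info(nmd, pool, &clustsize,
 *		    &numclusters))
 *			break;		// not finalized, or bad arguments
 *		// map numclusters chunks of clustsize bytes each
 *	}
 *	NMA_UNLOCK(nmd);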
*/ int netmap_mem2_get_pool_info(struct netmap_mem_d* nmd, u_int pool, u_int *clustsize, u_int *numclusters) { if (!nmd || !clustsize || !numclusters || pool >= NETMAP_POOLS_NR) return 1; /* invalid arguments */ // NMA_LOCK_ASSERT(nmd); if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { *clustsize = *numclusters = 0; return 1; /* not ready yet */ } *clustsize = nmd->pools[pool]._clustsize; *numclusters = nmd->pools[pool].numclusters; return 0; /* success */ } static int netmap_mem2_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags, nm_memid_t *id) { int error = 0; NMA_LOCK(nmd); error = netmap_mem_config(nmd); if (error) goto out; if (size) { if (nmd->flags & NETMAP_MEM_FINALIZED) { *size = nmd->nm_totalsize; } else { int i; *size = 0; for (i = 0; i < NETMAP_POOLS_NR; i++) { struct netmap_obj_pool *p = nmd->pools + i; *size += (p->_numclusters * p->_clustsize); } } } if (memflags) *memflags = nmd->flags; if (id) *id = nmd->nm_id; out: NMA_UNLOCK(nmd); return error; } /* * we store objects by kernel address, need to find the offset * within the pool to export the value to userspace. * Algorithm: scan until we find the cluster, then add the * actual offset in the cluster */ static ssize_t netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) { int i, k = p->_clustentries, n = p->objtotal; ssize_t ofs = 0; for (i = 0; i < n; i += k, ofs += p->_clustsize) { const char *base = p->lut[i].vaddr; ssize_t relofs = (const char *) vaddr - base; if (relofs < 0 || relofs >= p->_clustsize) continue; ofs = ofs + relofs; ND("%s: return offset %d (cluster %d) for pointer %p", p->name, ofs, i, vaddr); return ofs; } D("address %p is not contained inside any cluster (%s)", vaddr, p->name); return 0; /* An error occurred */ } /* Helper functions which convert virtual addresses to offsets */ #define netmap_if_offset(n, v) \ netmap_obj_offset(&(n)->pools[NETMAP_IF_POOL], (v)) #define netmap_ring_offset(n, v) \ ((n)->pools[NETMAP_IF_POOL].memtotal + \ netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v))) static ssize_t netmap_mem2_if_offset(struct netmap_mem_d *nmd, const void *addr) { ssize_t v; NMA_LOCK(nmd); v = netmap_if_offset(nmd, addr); NMA_UNLOCK(nmd); return v; } /* * report the index, and use start position as a hint, * otherwise buffer allocation becomes terribly expensive. */ static void * netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ uint32_t mask, j = 0; /* slot counter */ void *vaddr = NULL; if (len > p->_objsize) { D("%s request size %d too large", p->name, len); // XXX cannot reduce the size return NULL; } if (p->objfree == 0) { D("no more %s objects", p->name); return NULL; } if (start) i = *start; /* termination is guaranteed by p->free, but better check bounds on i */ while (vaddr == NULL && i < p->bitmap_slots) { uint32_t cur = p->bitmap[i]; if (cur == 0) { /* bitmask is fully used */ i++; continue; } /* locate a slot */ for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ; p->bitmap[i] &= ~mask; /* mark object as in use */ p->objfree--; vaddr = p->lut[i * 32 + j].vaddr; if (index) *index = i * 32 + j; } ND("%s allocator: allocated object @ [%d][%d]: vaddr %p",p->name, i, j, vaddr); if (start) *start = i; return vaddr; } /* * free by index, not by address. * XXX should we also cleanup the content ? 
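 * (The bitmap word is j / 32 and the bit is 1 << (j % 32), with a set
 * bit meaning "free"; freeing index 70, for instance, sets bit 6 of
 * p->bitmap[2].)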
*/ static int netmap_obj_free(struct netmap_obj_pool *p, uint32_t j) { uint32_t *ptr, mask; if (j >= p->objtotal) { D("invalid index %u, max %u", j, p->objtotal); return 1; } ptr = &p->bitmap[j / 32]; mask = (1 << (j % 32)); if (*ptr & mask) { D("ouch, double free on buffer %d", j); return 1; } else { *ptr |= mask; p->objfree++; return 0; } } /* * free by address. This is slow but is only used for a few * objects (rings, nifp) */ static void netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) { u_int i, j, n = p->numclusters; for (i = 0, j = 0; i < n; i++, j += p->_clustentries) { void *base = p->lut[i * p->_clustentries].vaddr; ssize_t relofs = (ssize_t) vaddr - (ssize_t) base; /* Given address, is out of the scope of the current cluster.*/ if (vaddr < base || relofs >= p->_clustsize) continue; j = j + relofs / p->_objsize; /* KASSERT(j != 0, ("Cannot free object 0")); */ netmap_obj_free(p, j); return; } D("address %p is not contained inside any cluster (%s)", vaddr, p->name); } #define netmap_mem_bufsize(n) \ ((n)->pools[NETMAP_BUF_POOL]._objsize) #define netmap_if_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_IF_POOL], len, NULL, NULL) #define netmap_if_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_IF_POOL], (v)) #define netmap_ring_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_RING_POOL], len, NULL, NULL) #define netmap_ring_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_RING_POOL], (v)) #define netmap_buf_malloc(n, _pos, _index) \ netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], netmap_mem_bufsize(n), _pos, _index) #if 0 // XXX unused /* Return the index associated to the given packet buffer */ #define netmap_buf_index(n, v) \ (netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n)) #endif /* * allocate extra buffers in a linked list. * returns the actual number. 
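 *
 * The list is threaded through the buffers themselves: the first
 * uint32_t of each extra buffer stores the index of the next one, and
 * index 0 terminates the list (see netmap_extra_free() below).
 * A sketch of a walk under those assumptions:
 *
 *	uint32_t scan, next;
 *	for (scan = head; scan >= 2 && scan < objtotal; scan = next) {
 *		uint32_t *buf = na->na_lut.lut[scan].vaddr;
 *		next = *buf;		// 0 means end of list
 *		// use the buffer at 'buf'
 *	}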
*/ uint32_t netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) { struct netmap_mem_d *nmd = na->nm_mem; uint32_t i, pos = 0; /* opaque, scan position in the bitmap */ NMA_LOCK(nmd); *head = 0; /* default, 'null' index ie empty list */ for (i = 0 ; i < n; i++) { uint32_t cur = *head; /* save current head */ uint32_t *p = netmap_buf_malloc(nmd, &pos, head); if (p == NULL) { D("no more buffers after %d of %d", i, n); *head = cur; /* restore */ break; } ND(5, "allocate buffer %d -> %d", *head, cur); *p = cur; /* link to previous head */ } NMA_UNLOCK(nmd); return i; } static void netmap_extra_free(struct netmap_adapter *na, uint32_t head) { struct lut_entry *lut = na->na_lut.lut; struct netmap_mem_d *nmd = na->nm_mem; struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; uint32_t i, cur, *buf; ND("freeing the extra list"); for (i = 0; head >=2 && head < p->objtotal; i++) { cur = head; buf = lut[head].vaddr; head = *buf; *buf = 0; if (netmap_obj_free(p, cur)) break; } if (head != 0) D("breaking with head %d", head); if (netmap_verbose) D("freed %d buffers", i); } /* Return nonzero on error */ static int netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; u_int i = 0; /* slot counter */ uint32_t pos = 0; /* slot in p->bitmap */ uint32_t index = 0; /* buffer index */ for (i = 0; i < n; i++) { void *vaddr = netmap_buf_malloc(nmd, &pos, &index); if (vaddr == NULL) { D("no more buffers after %d of %d", i, n); goto cleanup; } slot[i].buf_idx = index; slot[i].len = p->_objsize; slot[i].flags = 0; } ND("allocated %d buffers, %d available, first at %d", n, p->objfree, pos); return (0); cleanup: while (i > 0) { i--; netmap_obj_free(p, slot[i].buf_idx); } bzero(slot, n * sizeof(slot[0])); return (ENOMEM); } static void netmap_mem_set_ring(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n, uint32_t index) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; u_int i; for (i = 0; i < n; i++) { slot[i].buf_idx = index; slot[i].len = p->_objsize; slot[i].flags = 0; } } static void netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; if (i < 2 || i >= p->objtotal) { D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal); return; } netmap_obj_free(p, i); } static void netmap_free_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) { u_int i; for (i = 0; i < n; i++) { if (slot[i].buf_idx > 2) netmap_free_buf(nmd, slot[i].buf_idx); } } static void netmap_reset_obj_allocator(struct netmap_obj_pool *p) { if (p == NULL) return; if (p->bitmap) free(p->bitmap, M_NETMAP); p->bitmap = NULL; if (p->lut) { u_int i; /* * Free each cluster allocated in * netmap_finalize_obj_allocator(). The cluster start * addresses are stored at multiples of p->_clusterentries * in the lut. */ for (i = 0; i < p->objtotal; i += p->_clustentries) { if (p->lut[i].vaddr) contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); } bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); #ifdef linux vfree(p->lut); #else free(p->lut, M_NETMAP); #endif } p->lut = NULL; p->objtotal = 0; p->memtotal = 0; p->numclusters = 0; p->objfree = 0; } /* * Free all resources related to an allocator. */ static void netmap_destroy_obj_allocator(struct netmap_obj_pool *p) { if (p == NULL) return; netmap_reset_obj_allocator(p); } /* * We receive a request for objtotal objects, of size objsize each. 
* Internally we may round up both numbers, as we allocate objects * in small clusters multiple of the page size. * We need to keep track of objtotal and clustentries, * as they are needed when freeing memory. * * XXX note -- userspace needs the buffers to be contiguous, * so we cannot afford gaps at the end of a cluster. */ /* call with NMA_LOCK held */ static int netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int objsize) { int i; u_int clustsize; /* the cluster size, multiple of page size */ u_int clustentries; /* how many objects per entry */ /* we store the current request, so we can * detect configuration changes later */ p->r_objtotal = objtotal; p->r_objsize = objsize; #define MAX_CLUSTSIZE (1<<22) // 4 MB #define LINE_ROUND NM_CACHE_ALIGN // 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); return EINVAL; } /* make sure objsize is a multiple of LINE_ROUND */ i = (objsize & (LINE_ROUND - 1)); if (i) { D("XXX aligning object by %d bytes", LINE_ROUND - i); objsize += LINE_ROUND - i; } if (objsize < p->objminsize || objsize > p->objmaxsize) { D("requested objsize %d out of range [%d, %d]", objsize, p->objminsize, p->objmaxsize); return EINVAL; } if (objtotal < p->nummin || objtotal > p->nummax) { D("requested objtotal %d out of range [%d, %d]", objtotal, p->nummin, p->nummax); return EINVAL; } /* * Compute number of objects using a brute-force approach: * given a max cluster size, * we try to fill it with objects keeping track of the * wasted space to the next page boundary. */ for (clustentries = 0, i = 1;; i++) { u_int delta, used = i * objsize; if (used > MAX_CLUSTSIZE) break; delta = used % PAGE_SIZE; if (delta == 0) { // exact solution clustentries = i; break; } } /* exact solution not found */ if (clustentries == 0) { D("unsupported allocation for %d bytes", objsize); return EINVAL; } /* compute clustsize */ clustsize = clustentries * objsize; if (netmap_verbose) D("objsize %d clustsize %d objects %d", objsize, clustsize, clustentries); /* * The number of clusters is n = ceil(objtotal/clustentries) * objtotal' = n * clustentries */ p->_clustentries = clustentries; p->_clustsize = clustsize; p->_numclusters = (objtotal + clustentries - 1) / clustentries; /* actual values (may be larger than requested) */ p->_objsize = objsize; p->_objtotal = p->_numclusters * clustentries; return 0; } static struct lut_entry * nm_alloc_lut(u_int nobj) { size_t n = sizeof(struct lut_entry) * nobj; struct lut_entry *lut; #ifdef linux lut = vmalloc(n); #else lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); #endif return lut; } /* call with NMA_LOCK held */ static int netmap_finalize_obj_allocator(struct netmap_obj_pool *p) { int i; /* must be signed */ size_t n; /* optimistically assume we have enough memory */ p->numclusters = p->_numclusters; p->objtotal = p->_objtotal; p->lut = nm_alloc_lut(p->objtotal); if (p->lut == NULL) { D("Unable to create lookup table for '%s'", p->name); goto clean; } /* Allocate the bitmap */ n = (p->objtotal + 31) / 32; p->bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, M_NOWAIT | M_ZERO); if (p->bitmap == NULL) { D("Unable to create bitmap (%d entries) for allocator '%s'", (int)n, p->name); goto clean; } p->bitmap_slots = n; /* * Allocate clusters, init pointers and bitmap */ n = p->_clustsize; for (i = 0; i < (int)p->objtotal;) { int lim = i + p->_clustentries; char *clust; /* * XXX Note, we only need contigmalloc() for buffers attached * to native interfaces. 
In all other cases (nifp, netmap rings * and even buffers for VALE ports or emulated interfaces) we * can live with standard malloc, because the hardware will not * access the pages directly. */ clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { /* * If we get here, there is a severe memory shortage, * so halve the allocated memory to reclaim some. */ D("Unable to create cluster at %d for '%s' allocator", i, p->name); if (i < 2) /* nothing to halve */ goto out; lim = i / 2; for (i--; i >= lim; i--) { p->bitmap[ (i>>5) ] &= ~( 1 << (i & 31) ); if (i % p->_clustentries == 0 && p->lut[i].vaddr) contigfree(p->lut[i].vaddr, n, M_NETMAP); p->lut[i].vaddr = NULL; } out: p->objtotal = i; /* we may have stopped in the middle of a cluster */ p->numclusters = (i + p->_clustentries - 1) / p->_clustentries; break; } /* * Set bitmap and lut state for all buffers in the current * cluster. * * [i, lim) is the set of buffer indexes that cover the * current cluster. * * 'clust' is really the address of the current buffer in * the current cluster as we index through it with a stride * of p->_objsize. */ for (; i < lim; i++, clust += p->_objsize) { p->bitmap[ (i>>5) ] |= ( 1 << (i & 31) ); p->lut[i].vaddr = clust; p->lut[i].paddr = vtophys(clust); } } p->objfree = p->objtotal; p->memtotal = p->numclusters * p->_clustsize; if (p->objfree == 0) goto clean; if (netmap_verbose) D("Pre-allocated %d clusters (%d/%dKB) for '%s'", p->numclusters, p->_clustsize >> 10, p->memtotal >> 10, p->name); return 0; clean: netmap_reset_obj_allocator(p); return ENOMEM; } /* call with lock held */ static int netmap_memory_config_changed(struct netmap_mem_d *nmd) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { if (nmd->pools[i].r_objsize != netmap_params[i].size || nmd->pools[i].r_objtotal != netmap_params[i].num) return 1; } return 0; } static void netmap_mem_reset_all(struct netmap_mem_d *nmd) { int i; if (netmap_verbose) D("resetting %p", nmd); for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_reset_obj_allocator(&nmd->pools[i]); } nmd->flags &= ~NETMAP_MEM_FINALIZED; } static int netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) { int i, lim = p->_objtotal; if (na->pdev == NULL) return 0; #if defined(__FreeBSD__) (void)i; (void)lim; D("unsupported on FreeBSD"); #elif defined(_WIN32) (void)i; (void)lim; D("unsupported on Windows"); //XXX_ale, really? #else /* linux */ for (i = 2; i < lim; i++) { netmap_unload_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr); } #endif /* linux */ return 0; } static int netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) { #if defined(__FreeBSD__) D("unsupported on FreeBSD"); #elif defined(_WIN32) D("unsupported on Windows"); //XXX_ale, really? 
#else /* linux */ int i, lim = p->_objtotal; if (na->pdev == NULL) return 0; for (i = 2; i < lim; i++) { netmap_load_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr, p->lut[i].vaddr); } #endif /* linux */ return 0; } static int -netmap_mem_init_shared_info(struct netmap_mem_d *nmd) -{ - struct netmap_mem_shared_info *nms_info; - ssize_t base; - - /* Use the first slot in IF_POOL */ - nms_info = netmap_if_malloc(nmd, sizeof(*nms_info)); - if (nms_info == NULL) { - return ENOMEM; - } - - base = netmap_if_offset(nmd, nms_info); - - memcpy(&nms_info->up, &nms_if_blueprint, sizeof(nms_if_blueprint)); - nms_info->buf_pool_offset = nmd->pools[NETMAP_IF_POOL].memtotal + nmd->pools[NETMAP_RING_POOL].memtotal; - nms_info->buf_pool_objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; - nms_info->buf_pool_objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; - nms_info->totalsize = nmd->nm_totalsize; - nms_info->features = NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE; - - return 0; -} - -static int netmap_mem_finalize_all(struct netmap_mem_d *nmd) { int i; if (nmd->flags & NETMAP_MEM_FINALIZED) return 0; nmd->lasterr = 0; nmd->nm_totalsize = 0; for (i = 0; i < NETMAP_POOLS_NR; i++) { nmd->lasterr = netmap_finalize_obj_allocator(&nmd->pools[i]); if (nmd->lasterr) goto error; nmd->nm_totalsize += nmd->pools[i].memtotal; } /* buffers 0 and 1 are reserved */ nmd->pools[NETMAP_BUF_POOL].objfree -= 2; nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; - /* expose info to the ptnetmap guest */ - nmd->lasterr = netmap_mem_init_shared_info(nmd); - if (nmd->lasterr) - goto error; - if (netmap_verbose) D("interfaces %d KB, rings %d KB, buffers %d MB", nmd->pools[NETMAP_IF_POOL].memtotal >> 10, nmd->pools[NETMAP_RING_POOL].memtotal >> 10, nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); if (netmap_verbose) D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); return 0; error: netmap_mem_reset_all(nmd); return nmd->lasterr; } static void netmap_mem_private_delete(struct netmap_mem_d *nmd) { if (nmd == NULL) return; if (netmap_verbose) D("deleting %p", nmd); if (nmd->active > 0) D("bug: deleting mem allocator with active=%d!", nmd->active); nm_mem_release_id(nmd); if (netmap_verbose) D("done deleting %p", nmd); NMA_LOCK_DESTROY(nmd); free(nmd, M_DEVBUF); } static int netmap_mem_private_config(struct netmap_mem_d *nmd) { /* nothing to do, we are configured on creation * and configuration never changes thereafter */ return 0; } static int netmap_mem_private_finalize(struct netmap_mem_d *nmd) { int err; err = netmap_mem_finalize_all(nmd); if (!err) nmd->active++; return err; } static void netmap_mem_private_deref(struct netmap_mem_d *nmd) { if (--nmd->active <= 0) netmap_mem_reset_all(nmd); } /* * allocator for private memory */ struct netmap_mem_d * netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, int *perr) { struct netmap_mem_d *d = NULL; struct netmap_obj_params p[NETMAP_POOLS_NR]; int i, err; u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), M_DEVBUF, M_NOWAIT | M_ZERO); if (d == NULL) { err = ENOMEM; goto error; } *d = nm_blueprint; err = nm_mem_assign_id(d); if (err) goto error; /* account for the fake host rings */ txr++; rxr++; /* copy the min values */ for (i = 0; i < NETMAP_POOLS_NR; i++) { p[i] = netmap_min_priv_params[i]; } /* possibly increase them to fit user request */ v = sizeof(struct netmap_if) + sizeof(ssize_t) * (txr + rxr); if (p[NETMAP_IF_POOL].size < v) p[NETMAP_IF_POOL].size = v; v = 2 + 4 * npipes; if 
(p[NETMAP_IF_POOL].num < v) p[NETMAP_IF_POOL].num = v; maxd = (txd > rxd) ? txd : rxd; v = sizeof(struct netmap_ring) + sizeof(struct netmap_slot) * maxd; if (p[NETMAP_RING_POOL].size < v) p[NETMAP_RING_POOL].size = v; /* each pipe endpoint needs two tx rings (1 normal + 1 host, fake) * and two rx rings (again, 1 normal and 1 fake host) */ v = txr + rxr + 8 * npipes; if (p[NETMAP_RING_POOL].num < v) p[NETMAP_RING_POOL].num = v; /* for each pipe we only need the buffers for the 4 "real" rings. * On the other end, the pipe ring dimension may be different from * the parent port ring dimension. As a compromise, we allocate twice the * space actually needed if the pipe rings were the same size as the parent rings */ v = (4 * npipes + rxr) * rxd + (4 * npipes + txr) * txd + 2 + extra_bufs; /* the +2 is for the tx and rx fake buffers (indices 0 and 1) */ if (p[NETMAP_BUF_POOL].num < v) p[NETMAP_BUF_POOL].num = v; if (netmap_verbose) D("req if %d*%d ring %d*%d buf %d*%d", p[NETMAP_IF_POOL].num, p[NETMAP_IF_POOL].size, p[NETMAP_RING_POOL].num, p[NETMAP_RING_POOL].size, p[NETMAP_BUF_POOL].num, p[NETMAP_BUF_POOL].size); for (i = 0; i < NETMAP_POOLS_NR; i++) { snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ, nm_blueprint.pools[i].name, name); err = netmap_config_obj_allocator(&d->pools[i], p[i].num, p[i].size); if (err) goto error; } d->flags &= ~NETMAP_MEM_FINALIZED; NMA_LOCK_INIT(d); return d; error: netmap_mem_private_delete(d); if (perr) *perr = err; return NULL; } /* call with lock held */ static int netmap_mem_global_config(struct netmap_mem_d *nmd) { int i; if (nmd->active) /* already in use, we cannot change the configuration */ goto out; if (!netmap_memory_config_changed(nmd)) goto out; ND("reconfiguring"); if (nmd->flags & NETMAP_MEM_FINALIZED) { /* reset previous allocation */ for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_reset_obj_allocator(&nmd->pools[i]); } nmd->flags &= ~NETMAP_MEM_FINALIZED; } for (i = 0; i < NETMAP_POOLS_NR; i++) { nmd->lasterr = netmap_config_obj_allocator(&nmd->pools[i], netmap_params[i].num, netmap_params[i].size); if (nmd->lasterr) goto out; } out: return nmd->lasterr; } static int netmap_mem_global_finalize(struct netmap_mem_d *nmd) { int err; /* update configuration if changed */ if (netmap_mem_global_config(nmd)) return nmd->lasterr; nmd->active++; if (nmd->flags & NETMAP_MEM_FINALIZED) { /* may happen if config is not changed */ ND("nothing to do"); goto out; } if (netmap_mem_finalize_all(nmd)) goto out; nmd->lasterr = 0; out: if (nmd->lasterr) nmd->active--; err = nmd->lasterr; return err; } static void netmap_mem_global_delete(struct netmap_mem_d *nmd) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_destroy_obj_allocator(&nm_mem.pools[i]); } NMA_LOCK_DESTROY(&nm_mem); } int netmap_mem_init(void) { NMA_LOCK_INIT(&nm_mem); netmap_mem_get(&nm_mem); return (0); } void netmap_mem_fini(void) { netmap_mem_put(&nm_mem); } static void netmap_free_rings(struct netmap_adapter *na) { enum txrx t; for_rx_tx(t) { u_int i; for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; struct netmap_ring *ring = kring->ring; if (ring == NULL || kring->users > 0 || (kring->nr_kflags & NKR_NEEDRING)) { ND("skipping ring %s (ring %p, users %d)", kring->name, ring, kring->users); continue; } if (i != nma_get_nrings(na, t) || na->na_flags & NAF_HOST_RINGS) netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); netmap_ring_free(na->nm_mem, ring); kring->ring = NULL; } } } /* call with NMA_LOCK held * * * Allocate netmap rings and 
buffers for this card * The rings are contiguous, but have variable size. * The kring array must follow the layout described * in netmap_krings_create(). */ static int netmap_mem2_rings_create(struct netmap_adapter *na) { enum txrx t; NMA_LOCK(na->nm_mem); for_rx_tx(t) { u_int i; for (i = 0; i <= nma_get_nrings(na, t); i++) { struct netmap_kring *kring = &NMR(na, t)[i]; struct netmap_ring *ring = kring->ring; u_int len, ndesc; if (ring || (!kring->users && !(kring->nr_kflags & NKR_NEEDRING))) { /* uneeded, or already created by somebody else */ ND("skipping ring %s", kring->name); continue; } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { D("Cannot allocate %s_ring", nm_txrx2str(t)); goto cleanup; } ND("txring at %p", ring); kring->ring = ring; *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); /* copy values from kring */ ring->head = kring->rhead; ring->cur = kring->rcur; ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = netmap_mem_bufsize(na->nm_mem); ND("%s h %d c %d t %d", kring->name, ring->head, ring->cur, ring->tail); ND("initializing slots for %s_ring", nm_txrx2str(txrx)); if (i != nma_get_nrings(na, t) || (na->na_flags & NAF_HOST_RINGS)) { /* this is a real ring */ if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { D("Cannot allocate buffers for %s_ring", nm_txrx2str(t)); goto cleanup; } } else { /* this is a fake ring, set all indices to 0 */ netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 0); } /* ring info */ *(uint16_t *)(uintptr_t)&ring->ringid = kring->ring_id; *(uint16_t *)(uintptr_t)&ring->dir = kring->tx; } } NMA_UNLOCK(na->nm_mem); return 0; cleanup: netmap_free_rings(na); NMA_UNLOCK(na->nm_mem); return ENOMEM; } static void netmap_mem2_rings_delete(struct netmap_adapter *na) { /* last instance, release bufs and rings */ NMA_LOCK(na->nm_mem); netmap_free_rings(na); NMA_UNLOCK(na->nm_mem); } /* call with NMA_LOCK held */ /* * Allocate the per-fd structure netmap_if. * * We assume that the configuration stored in na * (number of tx/rx rings and descs) does not change while * the interface is in netmap mode. */ static struct netmap_if * netmap_mem2_if_new(struct netmap_adapter *na) { struct netmap_if *nifp; ssize_t base; /* handy for relative offsets between rings and nifp */ u_int i, len, n[NR_TXRX], ntot; enum txrx t; ntot = 0; for_rx_tx(t) { /* account for the (eventually fake) host rings */ n[t] = nma_get_nrings(na, t) + 1; ntot += n[t]; } /* * the descriptor is followed inline by an array of offsets * to the tx and rx rings in the shared memory region. */ NMA_LOCK(na->nm_mem); len = sizeof(struct netmap_if) + (ntot * sizeof(ssize_t)); nifp = netmap_if_malloc(na->nm_mem, len); if (nifp == NULL) { NMA_UNLOCK(na->nm_mem); return NULL; } /* initialize base fields -- override const */ *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; strncpy(nifp->ni_name, na->name, (size_t)IFNAMSIZ); /* * fill the slots for the rx and tx rings. They contain the offset * between the ring and nifp, so the information is usable in * userspace to reach the ring from the nifp. 
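 *
 * (Conceptually this is what the userspace NETMAP_TXRING()/
 * NETMAP_RXRING() accessors rely on:
 *	ring = (struct netmap_ring *)((char *)nifp + nifp->ring_ofs[i]);
 * with the rx offsets stored after the n[NR_TX] tx entries.)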
*/ base = netmap_if_offset(na->nm_mem, nifp); for (i = 0; i < n[NR_TX]; i++) { if (na->tx_rings[i].ring == NULL) { // XXX maybe use the offset of an error ring, // like we do for buffers? *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = 0; continue; } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base; } for (i = 0; i < n[NR_RX]; i++) { if (na->rx_rings[i].ring == NULL) { // XXX maybe use the offset of an error ring, // like we do for buffers? *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = 0; continue; } *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n[NR_TX]] = netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base; } NMA_UNLOCK(na->nm_mem); return (nifp); } static void netmap_mem2_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) { if (nifp == NULL) /* nothing to do */ return; NMA_LOCK(na->nm_mem); if (nifp->ni_bufs_head) netmap_extra_free(na, nifp->ni_bufs_head); netmap_if_free(na->nm_mem, nifp); NMA_UNLOCK(na->nm_mem); } static void netmap_mem_global_deref(struct netmap_mem_d *nmd) { nmd->active--; if (!nmd->active) nmd->nm_grp = -1; if (netmap_verbose) D("active = %d", nmd->active); } struct netmap_mem_ops netmap_mem_global_ops = { .nmd_get_lut = netmap_mem2_get_lut, .nmd_get_info = netmap_mem2_get_info, .nmd_ofstophys = netmap_mem2_ofstophys, .nmd_config = netmap_mem_global_config, .nmd_finalize = netmap_mem_global_finalize, .nmd_deref = netmap_mem_global_deref, .nmd_delete = netmap_mem_global_delete, .nmd_if_offset = netmap_mem2_if_offset, .nmd_if_new = netmap_mem2_if_new, .nmd_if_delete = netmap_mem2_if_delete, .nmd_rings_create = netmap_mem2_rings_create, .nmd_rings_delete = netmap_mem2_rings_delete }; struct netmap_mem_ops netmap_mem_private_ops = { .nmd_get_lut = netmap_mem2_get_lut, .nmd_get_info = netmap_mem2_get_info, .nmd_ofstophys = netmap_mem2_ofstophys, .nmd_config = netmap_mem_private_config, .nmd_finalize = netmap_mem_private_finalize, .nmd_deref = netmap_mem_private_deref, .nmd_if_offset = netmap_mem2_if_offset, .nmd_delete = netmap_mem_private_delete, .nmd_if_new = netmap_mem2_if_new, .nmd_if_delete = netmap_mem2_if_delete, .nmd_rings_create = netmap_mem2_rings_create, .nmd_rings_delete = netmap_mem2_rings_delete }; +int +netmap_mem_pools_info_get(struct nmreq *nmr, struct netmap_adapter *na) +{ + uintptr_t *pp = (uintptr_t *)&nmr->nr_arg1; + struct netmap_pools_info *upi = (struct netmap_pools_info *)(*pp); + struct netmap_mem_d *nmd = na->nm_mem; + struct netmap_pools_info pi; + unsigned int memsize; + uint16_t memid; + int ret; + + if (!nmd) { + return -1; + } + + ret = netmap_mem_get_info(nmd, &memsize, NULL, &memid); + if (ret) { + return ret; + } + + pi.memsize = memsize; + pi.memid = memid; + pi.if_pool_offset = 0; + pi.if_pool_objtotal = nmd->pools[NETMAP_IF_POOL].objtotal; + pi.if_pool_objsize = nmd->pools[NETMAP_IF_POOL]._objsize; + + pi.ring_pool_offset = nmd->pools[NETMAP_IF_POOL].memtotal; + pi.ring_pool_objtotal = nmd->pools[NETMAP_RING_POOL].objtotal; + pi.ring_pool_objsize = nmd->pools[NETMAP_RING_POOL]._objsize; + + pi.buf_pool_offset = nmd->pools[NETMAP_IF_POOL].memtotal + + nmd->pools[NETMAP_RING_POOL].memtotal; + pi.buf_pool_objtotal = nmd->pools[NETMAP_BUF_POOL].objtotal; + pi.buf_pool_objsize = nmd->pools[NETMAP_BUF_POOL]._objsize; + + ret = copyout(&pi, upi, sizeof(pi)); + if (ret) { + return ret; + } + + return 0; +} + #ifdef WITH_PTNETMAP_GUEST struct mem_pt_if { struct mem_pt_if *next; struct ifnet *ifp; unsigned int nifp_offset; - nm_pt_guest_ptctl_t ptctl; }; /* Netmap 
allocator for ptnetmap guests. */ struct netmap_mem_ptg { struct netmap_mem_d up; vm_paddr_t nm_paddr; /* physical address in the guest */ void *nm_addr; /* virtual address in the guest */ struct netmap_lut buf_lut; /* lookup table for BUF pool in the guest */ - nm_memid_t nm_host_id; /* allocator identifier in the host */ - struct ptnetmap_memdev *ptn_dev; + nm_memid_t host_mem_id; /* allocator identifier in the host */ + struct ptnetmap_memdev *ptn_dev;/* ptnetmap memdev */ struct mem_pt_if *pt_ifs; /* list of interfaces in passthrough */ }; /* Link a passthrough interface to a passthrough netmap allocator. */ static int netmap_mem_pt_guest_ifp_add(struct netmap_mem_d *nmd, struct ifnet *ifp, - unsigned int nifp_offset, - nm_pt_guest_ptctl_t ptctl) + unsigned int nifp_offset) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; struct mem_pt_if *ptif = malloc(sizeof(*ptif), M_NETMAP, M_NOWAIT | M_ZERO); if (!ptif) { return ENOMEM; } NMA_LOCK(nmd); ptif->ifp = ifp; ptif->nifp_offset = nifp_offset; - ptif->ptctl = ptctl; if (ptnmd->pt_ifs) { ptif->next = ptnmd->pt_ifs; } ptnmd->pt_ifs = ptif; NMA_UNLOCK(nmd); D("added (ifp=%p,nifp_offset=%u)", ptif->ifp, ptif->nifp_offset); return 0; } /* Called with NMA_LOCK(nmd) held. */ static struct mem_pt_if * netmap_mem_pt_guest_ifp_lookup(struct netmap_mem_d *nmd, struct ifnet *ifp) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; struct mem_pt_if *curr; for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { if (curr->ifp == ifp) { return curr; } } return NULL; } /* Unlink a passthrough interface from a passthrough netmap allocator. */ int netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *nmd, struct ifnet *ifp) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; struct mem_pt_if *prev = NULL; struct mem_pt_if *curr; int ret = -1; NMA_LOCK(nmd); for (curr = ptnmd->pt_ifs; curr; curr = curr->next) { if (curr->ifp == ifp) { if (prev) { prev->next = curr->next; } else { ptnmd->pt_ifs = curr->next; } D("removed (ifp=%p,nifp_offset=%u)", curr->ifp, curr->nifp_offset); free(curr, M_NETMAP); ret = 0; break; } prev = curr; } NMA_UNLOCK(nmd); return ret; } -/* Read allocator info from the first netmap_if (only on finalize) */ static int -netmap_mem_pt_guest_read_shared_info(struct netmap_mem_d *nmd) -{ - struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; - struct netmap_mem_shared_info *nms_info; - uint32_t bufsize; - uint32_t nbuffers; - char *vaddr; - vm_paddr_t paddr; - int i; - - nms_info = (struct netmap_mem_shared_info *)ptnmd->nm_addr; - if (strncmp(nms_info->up.ni_name, NMS_NAME, sizeof(NMS_NAME)) != 0) { - D("error, the first slot does not contain shared info"); - return EINVAL; - } - /* check features mem_shared info */ - if ((nms_info->features & (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) != - (NMS_FEAT_BUF_POOL | NMS_FEAT_MEMSIZE)) { - D("error, the shared info does not contain BUF_POOL and MEMSIZE"); - return EINVAL; - } - - bufsize = nms_info->buf_pool_objsize; - nbuffers = nms_info->buf_pool_objtotal; - - /* allocate the lut */ - if (ptnmd->buf_lut.lut == NULL) { - D("allocating lut"); - ptnmd->buf_lut.lut = nm_alloc_lut(nbuffers); - if (ptnmd->buf_lut.lut == NULL) { - D("lut allocation failed"); - return ENOMEM; - } - } - - /* we have physically contiguous memory mapped through PCI BAR */ - vaddr = (char *)(ptnmd->nm_addr) + nms_info->buf_pool_offset; - paddr = ptnmd->nm_paddr + nms_info->buf_pool_offset; - - for (i = 0; i < nbuffers; i++) { - ptnmd->buf_lut.lut[i].vaddr = vaddr; - 
ptnmd->buf_lut.lut[i].paddr = paddr; - vaddr += bufsize; - paddr += bufsize; - } - - ptnmd->buf_lut.objtotal = nbuffers; - ptnmd->buf_lut.objsize = bufsize; - - nmd->nm_totalsize = nms_info->totalsize; - - return 0; -} - -static int netmap_mem_pt_guest_get_lut(struct netmap_mem_d *nmd, struct netmap_lut *lut) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; if (!(nmd->flags & NETMAP_MEM_FINALIZED)) { return EINVAL; } *lut = ptnmd->buf_lut; return 0; } static int netmap_mem_pt_guest_get_info(struct netmap_mem_d *nmd, u_int *size, u_int *memflags, uint16_t *id) { int error = 0; NMA_LOCK(nmd); error = nmd->ops->nmd_config(nmd); if (error) goto out; if (size) *size = nmd->nm_totalsize; if (memflags) *memflags = nmd->flags; if (id) *id = nmd->nm_id; out: NMA_UNLOCK(nmd); return error; } static vm_paddr_t netmap_mem_pt_guest_ofstophys(struct netmap_mem_d *nmd, vm_ooffset_t off) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; vm_paddr_t paddr; /* if the offset is valid, just return csb->base_addr + off */ paddr = (vm_paddr_t)(ptnmd->nm_paddr + off); ND("off %lx padr %lx", off, (unsigned long)paddr); return paddr; } static int netmap_mem_pt_guest_config(struct netmap_mem_d *nmd) { /* nothing to do, we are configured on creation * and configuration never changes thereafter */ return 0; } static int netmap_mem_pt_guest_finalize(struct netmap_mem_d *nmd) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; + uint64_t mem_size; + uint32_t bufsize; + uint32_t nbuffers; + uint32_t poolofs; + vm_paddr_t paddr; + char *vaddr; + int i; int error = 0; nmd->active++; if (nmd->flags & NETMAP_MEM_FINALIZED) goto out; if (ptnmd->ptn_dev == NULL) { D("ptnetmap memdev not attached"); error = ENOMEM; goto err; } - /* map memory through ptnetmap-memdev BAR */ + /* Map memory through ptnetmap-memdev BAR. */ error = nm_os_pt_memdev_iomap(ptnmd->ptn_dev, &ptnmd->nm_paddr, - &ptnmd->nm_addr); + &ptnmd->nm_addr, &mem_size); if (error) goto err; - /* read allcator info and create lut */ - error = netmap_mem_pt_guest_read_shared_info(nmd); - if (error) - goto err; + /* Initialize the lut using the information contained in the + * ptnetmap memory device. 
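+ * The memory device exposes the buffer pool geometry through its I/O
+ * registers (object size, object count and pool offset, read just below),
+ * so the address of the i-th buffer follows directly; as a sketch:
+ *
+ *   vaddr_i = (char *)ptnmd->nm_addr + poolofs + i * bufsize;
+ *   paddr_i = ptnmd->nm_paddr + poolofs + i * bufsize;
+ *
+ * which is exactly how the lut entries are filled in the loop below.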
*/ + bufsize = nm_os_pt_memdev_ioread(ptnmd->ptn_dev, + PTNET_MDEV_IO_BUF_POOL_OBJSZ); + nbuffers = nm_os_pt_memdev_ioread(ptnmd->ptn_dev, + PTNET_MDEV_IO_BUF_POOL_OBJNUM); + /* allocate the lut */ + if (ptnmd->buf_lut.lut == NULL) { + D("allocating lut"); + ptnmd->buf_lut.lut = nm_alloc_lut(nbuffers); + if (ptnmd->buf_lut.lut == NULL) { + D("lut allocation failed"); + return ENOMEM; + } + } + + /* we have physically contiguous memory mapped through PCI BAR */ + poolofs = nm_os_pt_memdev_ioread(ptnmd->ptn_dev, + PTNET_MDEV_IO_BUF_POOL_OFS); + vaddr = (char *)(ptnmd->nm_addr) + poolofs; + paddr = ptnmd->nm_paddr + poolofs; + + for (i = 0; i < nbuffers; i++) { + ptnmd->buf_lut.lut[i].vaddr = vaddr; + ptnmd->buf_lut.lut[i].paddr = paddr; + vaddr += bufsize; + paddr += bufsize; + } + + ptnmd->buf_lut.objtotal = nbuffers; + ptnmd->buf_lut.objsize = bufsize; + nmd->nm_totalsize = (unsigned int)mem_size; + nmd->flags |= NETMAP_MEM_FINALIZED; out: return 0; err: nmd->active--; return error; } static void netmap_mem_pt_guest_deref(struct netmap_mem_d *nmd) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; nmd->active--; if (nmd->active <= 0 && (nmd->flags & NETMAP_MEM_FINALIZED)) { nmd->flags &= ~NETMAP_MEM_FINALIZED; /* unmap ptnetmap-memdev memory */ if (ptnmd->ptn_dev) { nm_os_pt_memdev_iounmap(ptnmd->ptn_dev); } ptnmd->nm_addr = 0; ptnmd->nm_paddr = 0; } } static ssize_t netmap_mem_pt_guest_if_offset(struct netmap_mem_d *nmd, const void *vaddr) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)nmd; return (const char *)(vaddr) - (char *)(ptnmd->nm_addr); } static void netmap_mem_pt_guest_delete(struct netmap_mem_d *nmd) { if (nmd == NULL) return; if (netmap_verbose) D("deleting %p", nmd); if (nmd->active > 0) D("bug: deleting mem allocator with active=%d!", nmd->active); nm_mem_release_id(nmd); if (netmap_verbose) D("done deleting %p", nmd); NMA_LOCK_DESTROY(nmd); free(nmd, M_DEVBUF); } static struct netmap_if * netmap_mem_pt_guest_if_new(struct netmap_adapter *na) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; struct mem_pt_if *ptif; struct netmap_if *nifp = NULL; NMA_LOCK(na->nm_mem); ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); if (ptif == NULL) { D("Error: interface %p is not in passthrough", na->ifp); goto out; } nifp = (struct netmap_if *)((char *)(ptnmd->nm_addr) + ptif->nifp_offset); NMA_UNLOCK(na->nm_mem); out: return nifp; } static void netmap_mem_pt_guest_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) { struct mem_pt_if *ptif; NMA_LOCK(na->nm_mem); - ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); if (ptif == NULL) { D("Error: interface %p is not in passthrough", na->ifp); - goto out; } - - ptif->ptctl(na->ifp, PTNETMAP_PTCTL_IFDELETE); -out: NMA_UNLOCK(na->nm_mem); } static int netmap_mem_pt_guest_rings_create(struct netmap_adapter *na) { struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; struct mem_pt_if *ptif; struct netmap_if *nifp; int i, error = -1; NMA_LOCK(na->nm_mem); ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); if (ptif == NULL) { D("Error: interface %p is not in passthrough", na->ifp); goto out; } /* point each kring to the corresponding backend ring */ nifp = (struct netmap_if *)((char *)ptnmd->nm_addr + ptif->nifp_offset); for (i = 0; i <= na->num_tx_rings; i++) { struct netmap_kring *kring = na->tx_rings + i; if (kring->ring) continue; kring->ring = (struct netmap_ring *) ((char *)nifp + nifp->ring_ofs[i]); } for (i = 0; i <= na->num_rx_rings; i++) { 
struct netmap_kring *kring = na->rx_rings + i; if (kring->ring) continue; kring->ring = (struct netmap_ring *) ((char *)nifp + nifp->ring_ofs[i + na->num_tx_rings + 1]); } - //error = ptif->ptctl->nm_ptctl(ifp, PTNETMAP_PTCTL_RINGSCREATE); error = 0; out: NMA_UNLOCK(na->nm_mem); return error; } static void netmap_mem_pt_guest_rings_delete(struct netmap_adapter *na) { /* TODO: remove?? */ #if 0 struct netmap_mem_ptg *ptnmd = (struct netmap_mem_ptg *)na->nm_mem; struct mem_pt_if *ptif = netmap_mem_pt_guest_ifp_lookup(na->nm_mem, na->ifp); #endif } static struct netmap_mem_ops netmap_mem_pt_guest_ops = { .nmd_get_lut = netmap_mem_pt_guest_get_lut, .nmd_get_info = netmap_mem_pt_guest_get_info, .nmd_ofstophys = netmap_mem_pt_guest_ofstophys, .nmd_config = netmap_mem_pt_guest_config, .nmd_finalize = netmap_mem_pt_guest_finalize, .nmd_deref = netmap_mem_pt_guest_deref, .nmd_if_offset = netmap_mem_pt_guest_if_offset, .nmd_delete = netmap_mem_pt_guest_delete, .nmd_if_new = netmap_mem_pt_guest_if_new, .nmd_if_delete = netmap_mem_pt_guest_if_delete, .nmd_rings_create = netmap_mem_pt_guest_rings_create, .nmd_rings_delete = netmap_mem_pt_guest_rings_delete }; /* Called with NMA_LOCK(&nm_mem) held. */ static struct netmap_mem_d * -netmap_mem_pt_guest_find_hostid(nm_memid_t host_id) +netmap_mem_pt_guest_find_memid(nm_memid_t mem_id) { struct netmap_mem_d *mem = NULL; struct netmap_mem_d *scan = netmap_last_mem_d; do { /* find ptnetmap allocator through host ID */ if (scan->ops->nmd_deref == netmap_mem_pt_guest_deref && - ((struct netmap_mem_ptg *)(scan))->nm_host_id == host_id) { + ((struct netmap_mem_ptg *)(scan))->host_mem_id == mem_id) { mem = scan; break; } scan = scan->next; } while (scan != netmap_last_mem_d); return mem; } /* Called with NMA_LOCK(&nm_mem) held. */ static struct netmap_mem_d * -netmap_mem_pt_guest_create(nm_memid_t host_id) +netmap_mem_pt_guest_create(nm_memid_t mem_id) { struct netmap_mem_ptg *ptnmd; int err = 0; ptnmd = malloc(sizeof(struct netmap_mem_ptg), M_DEVBUF, M_NOWAIT | M_ZERO); if (ptnmd == NULL) { err = ENOMEM; goto error; } ptnmd->up.ops = &netmap_mem_pt_guest_ops; - ptnmd->nm_host_id = host_id; + ptnmd->host_mem_id = mem_id; ptnmd->pt_ifs = NULL; /* Assign new id in the guest (We have the lock) */ err = nm_mem_assign_id_locked(&ptnmd->up); if (err) goto error; ptnmd->up.flags &= ~NETMAP_MEM_FINALIZED; ptnmd->up.flags |= NETMAP_MEM_IO; NMA_LOCK_INIT(&ptnmd->up); return &ptnmd->up; error: netmap_mem_pt_guest_delete(&ptnmd->up); return NULL; } /* * find host id in guest allocators and create guest allocator * if it is not there */ static struct netmap_mem_d * -netmap_mem_pt_guest_get(nm_memid_t host_id) +netmap_mem_pt_guest_get(nm_memid_t mem_id) { struct netmap_mem_d *nmd; NMA_LOCK(&nm_mem); - nmd = netmap_mem_pt_guest_find_hostid(host_id); + nmd = netmap_mem_pt_guest_find_memid(mem_id); if (nmd == NULL) { - nmd = netmap_mem_pt_guest_create(host_id); + nmd = netmap_mem_pt_guest_create(mem_id); } NMA_UNLOCK(&nm_mem); return nmd; } /* * The guest allocator can be created by ptnetmap_memdev (during the device - * attach) or by ptnetmap device (e1000/virtio), during the netmap_attach. + * attach) or by ptnetmap device (ptnet), during the netmap_attach. * * The order is not important (we have different order in LINUX and FreeBSD). * The first one, creates the device, and the second one simply attaches it. 
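 * Both entry points funnel into the same lookup/create helper; roughly:
 *
 *   netmap_mem_pt_guest_attach(ptn_dev, mem_id)      // memdev attach path
 *       -> netmap_mem_pt_guest_get(mem_id)           // find or create
 *   netmap_mem_pt_guest_new(ifp, nifp_offset, memid) // ptnet attach path
 *       -> netmap_mem_pt_guest_get(memid)
 *
 * so whichever runs first creates the guest allocator and the other one
 * simply finds it by its host memory id.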
*/ /* Called when ptnetmap_memdev is attaching, to attach a new allocator in * the guest */ struct netmap_mem_d * -netmap_mem_pt_guest_attach(struct ptnetmap_memdev *ptn_dev, nm_memid_t host_id) +netmap_mem_pt_guest_attach(struct ptnetmap_memdev *ptn_dev, nm_memid_t mem_id) { struct netmap_mem_d *nmd; struct netmap_mem_ptg *ptnmd; - nmd = netmap_mem_pt_guest_get(host_id); + nmd = netmap_mem_pt_guest_get(mem_id); /* assign this device to the guest allocator */ if (nmd) { ptnmd = (struct netmap_mem_ptg *)nmd; ptnmd->ptn_dev = ptn_dev; } return nmd; } -/* Called when ptnetmap device (virtio/e1000) is attaching */ +/* Called when ptnet device is attaching */ struct netmap_mem_d * netmap_mem_pt_guest_new(struct ifnet *ifp, unsigned int nifp_offset, - nm_pt_guest_ptctl_t ptctl) + unsigned int memid) { struct netmap_mem_d *nmd; - nm_memid_t host_id; - if (ifp == NULL || ptctl == NULL) { + if (ifp == NULL) { return NULL; } - /* Get the host id allocator. */ - host_id = ptctl(ifp, PTNETMAP_PTCTL_HOSTMEMID); + nmd = netmap_mem_pt_guest_get((nm_memid_t)memid); - nmd = netmap_mem_pt_guest_get(host_id); - if (nmd) { - netmap_mem_pt_guest_ifp_add(nmd, ifp, nifp_offset, - ptctl); + netmap_mem_pt_guest_ifp_add(nmd, ifp, nifp_offset); } return nmd; } #endif /* WITH_PTNETMAP_GUEST */ Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_mem2.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_mem2.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_mem2.h (revision 308054) @@ -1,181 +1,183 @@ /* * Copyright (C) 2012-2014 Matteo Landi * Copyright (C) 2012-2016 Luigi Rizzo * Copyright (C) 2012-2016 Giuseppe Lettieri * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * (New) memory allocator for netmap */ /* * This allocator creates three memory pools: * nm_if_pool for the struct netmap_if * nm_ring_pool for the struct netmap_ring * nm_buf_pool for the packet buffers. * * that contain netmap objects. Each pool is made of a number of clusters, * multiple of a page size, each containing an integer number of objects. * The clusters are contiguous in user space but not in the kernel. * Only nm_buf_pool needs to be dma-able, * but for convenience use the same type of allocator for all. 
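 * (As an illustration, using the default sizes quoted further down: 2 KB
 * buffers pack exactly two per 4 KB page, so buffer clusters of any page
 * multiple waste nothing, while a 36 KB netmap_ring needs a cluster of at
 * least nine pages.)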
* * Once mapped, the three pools are exported to userspace * as a contiguous block, starting from nm_if_pool. Each * cluster (and pool) is an integral number of pages. * [ . . . ][ . . . . . .][ . . . . . . . . . .] * nm_if nm_ring nm_buf * * The userspace areas contain offsets of the objects in userspace. * When (at init time) we write these offsets, we find out the index * of the object, and from there locate the offset from the beginning * of the region. * * The invididual allocators manage a pool of memory for objects of * the same size. * The pool is split into smaller clusters, whose size is a * multiple of the page size. The cluster size is chosen * to minimize the waste for a given max cluster size * (we do it by brute force, as we have relatively few objects * per cluster). * * Objects are aligned to the cache line (64 bytes) rounding up object * sizes when needed. A bitmap contains the state of each object. * Allocation scans the bitmap; this is done only on attach, so we are not * too worried about performance * * For each allocator we can define (thorugh sysctl) the size and * number of each object. Memory is allocated at the first use of a * netmap file descriptor, and can be freed when all such descriptors * have been released (including unmapping the memory). * If memory is scarce, the system tries to get as much as possible * and the sysctl values reflect the actual allocation. * Together with desired values, the sysctl export also absolute * min and maximum values that cannot be overridden. * * struct netmap_if: * variable size, max 16 bytes per ring pair plus some fixed amount. * 1024 bytes should be large enough in practice. * * In the worst case we have one netmap_if per ring in the system. * * struct netmap_ring * variable size, 8 byte per slot plus some fixed amount. * Rings can be large (e.g. 4k slots, or >32Kbytes). * We default to 36 KB (9 pages), and a few hundred rings. * * struct netmap_buffer * The more the better, both because fast interfaces tend to have * many slots, and because we may want to use buffers to store * packets in userspace avoiding copies. * Must contain a full frame (eg 1518, or more for vlans, jumbo * frames etc.) plus be nicely aligned, plus some NICs restrict * the size to multiple of 1K or so. Default to 2K */ #ifndef _NET_NETMAP_MEM2_H_ #define _NET_NETMAP_MEM2_H_ /* We implement two kinds of netmap_mem_d structures: * * - global: used by hardware NICS; * * - private: used by VALE ports. * * In both cases, the netmap_mem_d structure has the same lifetime as the * netmap_adapter of the corresponding NIC or port. It is the responsibility of * the client code to delete the private allocator when the associated * netmap_adapter is freed (this is implemented by the NAF_MEM_OWNER flag in * netmap.c). The 'refcount' field counts the number of active users of the * structure. The global allocator uses this information to prevent/allow * reconfiguration. The private allocators release all their memory when there * are no active users. By 'active user' we mean an existing netmap_priv * structure holding a reference to the allocator. 
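 * Client code typically brackets its use of an allocator with the
 * netmap_mem_get()/netmap_mem_put() pair declared below; conceptually:
 *
 *   netmap_mem_get(na->nm_mem);   // become a user of the allocator
 *   ...                           // create/use interfaces and rings
 *   netmap_mem_put(na->nm_mem);   // once the last user is gone, a private
 *                                 // allocator can release its memory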
*/ extern struct netmap_mem_d nm_mem; int netmap_mem_get_lut(struct netmap_mem_d *, struct netmap_lut *); vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); #ifdef _WIN32 PMDL win32_build_user_vm_map(struct netmap_mem_d* nmd); #endif int netmap_mem_finalize(struct netmap_mem_d *, struct netmap_adapter *); int netmap_mem_init(void); void netmap_mem_fini(void); struct netmap_if * netmap_mem_if_new(struct netmap_adapter *); void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); int netmap_mem_rings_create(struct netmap_adapter *); void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *, struct netmap_adapter *); int netmap_mem2_get_pool_info(struct netmap_mem_d *, u_int, u_int *, u_int *); int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); struct netmap_mem_d* netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, int* error); void netmap_mem_delete(struct netmap_mem_d *); //#define NM_DEBUG_MEM_PUTGET 1 #ifdef NM_DEBUG_MEM_PUTGET #define netmap_mem_get(nmd) \ do { \ __netmap_mem_get(nmd, __FUNCTION__, __LINE__); \ } while (0) #define netmap_mem_put(nmd) \ do { \ __netmap_mem_put(nmd, __FUNCTION__, __LINE__); \ } while (0) void __netmap_mem_get(struct netmap_mem_d *, const char *, int); void __netmap_mem_put(struct netmap_mem_d *, const char *, int); #else /* !NM_DEBUG_MEM_PUTGET */ void netmap_mem_get(struct netmap_mem_d *); void netmap_mem_put(struct netmap_mem_d *); #endif /* !NM_DEBUG_PUTGET */ #ifdef WITH_PTNETMAP_GUEST struct netmap_mem_d* netmap_mem_pt_guest_new(struct ifnet *, unsigned int nifp_offset, - nm_pt_guest_ptctl_t); + unsigned int memid); struct ptnetmap_memdev; struct netmap_mem_d* netmap_mem_pt_guest_attach(struct ptnetmap_memdev *, uint16_t); int netmap_mem_pt_guest_ifp_del(struct netmap_mem_d *, struct ifnet *); #endif /* WITH_PTNETMAP_GUEST */ + +int netmap_mem_pools_info_get(struct nmreq *, struct netmap_adapter *); #define NETMAP_MEM_PRIVATE 0x2 /* allocator uses private address space */ #define NETMAP_MEM_IO 0x4 /* the underlying memory is mmapped I/O */ uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n); #endif Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_pt.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_pt.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_pt.c (revision 308054) @@ -1,1438 +1,1453 @@ /* * Copyright (C) 2015 Stefano Garzarella * Copyright (C) 2016 Vincenzo Maffione * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * common headers */ #if defined(__FreeBSD__) #include #include #include #include #include #include #include #include #include //#define usleep_range(_1, _2) #define usleep_range(_1, _2) \ pause_sbt("ptnetmap-sleep", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE) #elif defined(linux) #include #endif #include #include #include #include #ifdef WITH_PTNETMAP_HOST /* RX cycle without receive any packets */ #define PTN_RX_DRY_CYCLES_MAX 10 /* Limit Batch TX to half ring. * Currently disabled, since it does not manage NS_MOREFRAG, which * results in random drops in the VALE txsync. */ //#define PTN_TX_BATCH_LIM(_n) ((_n >> 1)) //#define BUSY_WAIT #define NETMAP_PT_DEBUG /* Enables communication debugging. */ #ifdef NETMAP_PT_DEBUG #define DBG(x) x #else #define DBG(x) #endif #undef RATE //#define RATE /* Enables communication statistics. */ #ifdef RATE #define IFRATE(x) x struct rate_batch_stats { unsigned long sync; unsigned long sync_dry; unsigned long pkt; }; struct rate_stats { unsigned long gtxk; /* Guest --> Host Tx kicks. */ unsigned long grxk; /* Guest --> Host Rx kicks. */ unsigned long htxk; /* Host --> Guest Tx kicks. */ unsigned long hrxk; /* Host --> Guest Rx Kicks. */ unsigned long btxwu; /* Backend Tx wake-up. */ unsigned long brxwu; /* Backend Rx wake-up. */ struct rate_batch_stats txbs; struct rate_batch_stats rxbs; }; struct rate_context { struct timer_list timer; struct rate_stats new; struct rate_stats old; }; #define RATE_PERIOD 2 static void rate_callback(unsigned long arg) { struct rate_context * ctx = (struct rate_context *)arg; struct rate_stats cur = ctx->new; struct rate_batch_stats *txbs = &cur.txbs; struct rate_batch_stats *rxbs = &cur.rxbs; struct rate_batch_stats *txbs_old = &ctx->old.txbs; struct rate_batch_stats *rxbs_old = &ctx->old.rxbs; uint64_t tx_batch, rx_batch; unsigned long txpkts, rxpkts; unsigned long gtxk, grxk; int r; txpkts = txbs->pkt - txbs_old->pkt; rxpkts = rxbs->pkt - rxbs_old->pkt; tx_batch = ((txbs->sync - txbs_old->sync) > 0) ? txpkts / (txbs->sync - txbs_old->sync): 0; rx_batch = ((rxbs->sync - rxbs_old->sync) > 0) ? rxpkts / (rxbs->sync - rxbs_old->sync): 0; /* Fix-up gtxk and grxk estimates. 
*/ gtxk = (cur.gtxk - ctx->old.gtxk) - (cur.btxwu - ctx->old.btxwu); grxk = (cur.grxk - ctx->old.grxk) - (cur.brxwu - ctx->old.brxwu); printk("txpkts = %lu Hz\n", txpkts/RATE_PERIOD); printk("gtxk = %lu Hz\n", gtxk/RATE_PERIOD); printk("htxk = %lu Hz\n", (cur.htxk - ctx->old.htxk)/RATE_PERIOD); printk("btxw = %lu Hz\n", (cur.btxwu - ctx->old.btxwu)/RATE_PERIOD); printk("rxpkts = %lu Hz\n", rxpkts/RATE_PERIOD); printk("grxk = %lu Hz\n", grxk/RATE_PERIOD); printk("hrxk = %lu Hz\n", (cur.hrxk - ctx->old.hrxk)/RATE_PERIOD); printk("brxw = %lu Hz\n", (cur.brxwu - ctx->old.brxwu)/RATE_PERIOD); printk("txbatch = %llu avg\n", tx_batch); printk("rxbatch = %llu avg\n", rx_batch); printk("\n"); ctx->old = cur; r = mod_timer(&ctx->timer, jiffies + msecs_to_jiffies(RATE_PERIOD * 1000)); if (unlikely(r)) D("[ptnetmap] Error: mod_timer()\n"); } static void rate_batch_stats_update(struct rate_batch_stats *bf, uint32_t pre_tail, uint32_t act_tail, uint32_t num_slots) { int n = (int)act_tail - pre_tail; if (n) { if (n < 0) n += num_slots; bf->sync++; bf->pkt += n; } else { bf->sync_dry++; } } #else /* !RATE */ #define IFRATE(x) #endif /* RATE */ struct ptnetmap_state { /* Kthreads. */ struct nm_kthread **kthreads; /* Shared memory with the guest (TX/RX) */ struct ptnet_ring __user *ptrings; bool stopped; /* Netmap adapter wrapping the backend. */ struct netmap_pt_host_adapter *pth_na; IFRATE(struct rate_context rate_ctx;) }; static inline void ptnetmap_kring_dump(const char *title, const struct netmap_kring *kring) { RD(1, "%s - name: %s hwcur: %d hwtail: %d rhead: %d rcur: %d \ rtail: %d head: %d cur: %d tail: %d", title, kring->name, kring->nr_hwcur, kring->nr_hwtail, kring->rhead, kring->rcur, kring->rtail, kring->ring->head, kring->ring->cur, kring->ring->tail); } /* * TX functions to set/get and to handle host/guest kick. */ /* Enable or disable guest --> host kicks. */ static inline void ptring_kick_enable(struct ptnet_ring __user *ptring, uint32_t val) { CSB_WRITE(ptring, host_need_kick, val); } /* Are guest interrupt enabled or disabled? */ static inline uint32_t ptring_intr_enabled(struct ptnet_ring __user *ptring) { uint32_t v; CSB_READ(ptring, guest_need_kick, v); return v; } /* Enable or disable guest interrupts. */ static inline void ptring_intr_enable(struct ptnet_ring __user *ptring, uint32_t val) { CSB_WRITE(ptring, guest_need_kick, val); } /* Handle TX events: from the guest or from the backend */ static void ptnetmap_tx_handler(void *data) { struct netmap_kring *kring = data; struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)kring->na->na_private; struct ptnetmap_state *ptns = pth_na->ptns; struct ptnet_ring __user *ptring; struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */ bool more_txspace = false; struct nm_kthread *kth; uint32_t num_slots; int batch; IFRATE(uint32_t pre_tail); if (unlikely(!ptns)) { D("ERROR ptnetmap state is NULL"); return; } if (unlikely(ptns->stopped)) { RD(1, "backend netmap is being stopped"); return; } if (unlikely(nm_kr_tryget(kring, 1, NULL))) { D("ERROR nm_kr_tryget()"); return; } /* This is a guess, to be fixed in the rate callback. */ IFRATE(ptns->rate_ctx.new.gtxk++); /* Get TX ptring pointer from the CSB. */ ptring = ptns->ptrings + kring->ring_id; kth = ptns->kthreads[kring->ring_id]; num_slots = kring->nkr_num_slots; shadow_ring.head = kring->rhead; shadow_ring.cur = kring->rcur; /* Disable guest --> host notifications. 
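 * While the kthread is actively draining the ring it does not need further
 * kicks, so host_need_kick is cleared here and re-enabled only on the error
 * paths or just before the kthread goes to sleep, in the latter case
 * followed by a second read of the CSB to close the race with a concurrent
 * guest update.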
*/ ptring_kick_enable(ptring, 0); /* Copy the guest kring pointers from the CSB */ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots); for (;;) { /* If guest moves ahead too fast, let's cut the move so * that we don't exceed our batch limit. */ batch = shadow_ring.head - kring->nr_hwcur; if (batch < 0) batch += num_slots; #ifdef PTN_TX_BATCH_LIM if (batch > PTN_TX_BATCH_LIM(num_slots)) { uint32_t head_lim = kring->nr_hwcur + PTN_TX_BATCH_LIM(num_slots); if (head_lim >= num_slots) head_lim -= num_slots; ND(1, "batch: %d head: %d head_lim: %d", batch, shadow_ring.head, head_lim); shadow_ring.head = head_lim; batch = PTN_TX_BATCH_LIM(num_slots); } #endif /* PTN_TX_BATCH_LIM */ if (nm_kr_txspace(kring) <= (num_slots >> 1)) { shadow_ring.flags |= NAF_FORCE_RECLAIM; } /* Netmap prologue */ shadow_ring.tail = kring->rtail; if (unlikely(nm_txsync_prologue(kring, &shadow_ring) >= num_slots)) { /* Reinit ring and enable notifications. */ netmap_ring_reinit(kring); ptring_kick_enable(ptring, 1); break; } if (unlikely(netmap_verbose & NM_VERB_TXSYNC)) { ptnetmap_kring_dump("pre txsync", kring); } IFRATE(pre_tail = kring->rtail); if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) { /* Reenable notifications. */ ptring_kick_enable(ptring, 1); D("ERROR txsync()"); break; } /* * Finalize * Copy host hwcur and hwtail into the CSB for the guest sync(), and * do the nm_sync_finalize. */ ptnetmap_host_write_kring_csb(ptring, kring->nr_hwcur, kring->nr_hwtail); if (kring->rtail != kring->nr_hwtail) { /* Some more room available in the parent adapter. */ kring->rtail = kring->nr_hwtail; more_txspace = true; } IFRATE(rate_batch_stats_update(&ptns->rate_ctx.new.txbs, pre_tail, kring->rtail, num_slots)); if (unlikely(netmap_verbose & NM_VERB_TXSYNC)) { ptnetmap_kring_dump("post txsync", kring); } #ifndef BUSY_WAIT /* Interrupt the guest if needed. */ if (more_txspace && ptring_intr_enabled(ptring)) { /* Disable guest kick to avoid sending unnecessary kicks */ ptring_intr_enable(ptring, 0); nm_os_kthread_send_irq(kth); IFRATE(ptns->rate_ctx.new.htxk++); more_txspace = false; } #endif /* Read CSB to see if there is more work to do. */ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots); #ifndef BUSY_WAIT if (shadow_ring.head == kring->rhead) { /* * No more packets to transmit. We enable notifications and * go to sleep, waiting for a kick from the guest when new * new slots are ready for transmission. */ usleep_range(1,1); /* Reenable notifications. */ ptring_kick_enable(ptring, 1); /* Doublecheck. */ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots); if (shadow_ring.head != kring->rhead) { /* We won the race condition, there are more packets to * transmit. Disable notifications and do another cycle */ ptring_kick_enable(ptring, 0); continue; } break; } if (nm_kr_txempty(kring)) { /* No more available TX slots. We stop waiting for a notification * from the backend (netmap_tx_irq). */ ND(1, "TX ring"); break; } #endif if (unlikely(ptns->stopped)) { D("backend netmap is being stopped"); break; } } nm_kr_put(kring); if (more_txspace && ptring_intr_enabled(ptring)) { ptring_intr_enable(ptring, 0); nm_os_kthread_send_irq(kth); IFRATE(ptns->rate_ctx.new.htxk++); } } /* * We need RX kicks from the guest when (tail == head-1), where we wait * for the guest to refill. 
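 * In kring terms this is the "no RX slots" condition tested by
 * ptnetmap_norxslots() below: hwtail == nm_prev(head, num_slots - 1).
 * For example, with num_slots == 256 and head == 10 the backend stalls
 * once hwtail reaches 9, and can only resume after the guest advances
 * head, which it signals with an RX kick.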
*/ #ifndef BUSY_WAIT static inline int ptnetmap_norxslots(struct netmap_kring *kring, uint32_t g_head) { return (NM_ACCESS_ONCE(kring->nr_hwtail) == nm_prev(g_head, kring->nkr_num_slots - 1)); } #endif /* !BUSY_WAIT */ /* Handle RX events: from the guest or from the backend */ static void ptnetmap_rx_handler(void *data) { struct netmap_kring *kring = data; struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)kring->na->na_private; struct ptnetmap_state *ptns = pth_na->ptns; struct ptnet_ring __user *ptring; struct netmap_ring shadow_ring; /* shadow copy of the netmap_ring */ struct nm_kthread *kth; uint32_t num_slots; int dry_cycles = 0; bool some_recvd = false; IFRATE(uint32_t pre_tail); if (unlikely(!ptns || !ptns->pth_na)) { D("ERROR ptnetmap state %p, ptnetmap host adapter %p", ptns, ptns ? ptns->pth_na : NULL); return; } if (unlikely(ptns->stopped)) { RD(1, "backend netmap is being stopped"); return; } if (unlikely(nm_kr_tryget(kring, 1, NULL))) { D("ERROR nm_kr_tryget()"); return; } /* This is a guess, to be fixed in the rate callback. */ IFRATE(ptns->rate_ctx.new.grxk++); /* Get RX ptring pointer from the CSB. */ ptring = ptns->ptrings + (pth_na->up.num_tx_rings + kring->ring_id); kth = ptns->kthreads[pth_na->up.num_tx_rings + kring->ring_id]; num_slots = kring->nkr_num_slots; shadow_ring.head = kring->rhead; shadow_ring.cur = kring->rcur; /* Disable notifications. */ ptring_kick_enable(ptring, 0); /* Copy the guest kring pointers from the CSB */ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots); for (;;) { uint32_t hwtail; /* Netmap prologue */ shadow_ring.tail = kring->rtail; if (unlikely(nm_rxsync_prologue(kring, &shadow_ring) >= num_slots)) { /* Reinit ring and enable notifications. */ netmap_ring_reinit(kring); ptring_kick_enable(ptring, 1); break; } if (unlikely(netmap_verbose & NM_VERB_RXSYNC)) { ptnetmap_kring_dump("pre rxsync", kring); } IFRATE(pre_tail = kring->rtail); if (unlikely(kring->nm_sync(kring, shadow_ring.flags))) { /* Reenable notifications. */ ptring_kick_enable(ptring, 1); D("ERROR rxsync()"); break; } /* * Finalize * Copy host hwcur and hwtail into the CSB for the guest sync() */ hwtail = NM_ACCESS_ONCE(kring->nr_hwtail); ptnetmap_host_write_kring_csb(ptring, kring->nr_hwcur, hwtail); if (kring->rtail != hwtail) { kring->rtail = hwtail; some_recvd = true; dry_cycles = 0; } else { dry_cycles++; } IFRATE(rate_batch_stats_update(&ptns->rate_ctx.new.rxbs, pre_tail, kring->rtail, num_slots)); if (unlikely(netmap_verbose & NM_VERB_RXSYNC)) { ptnetmap_kring_dump("post rxsync", kring); } #ifndef BUSY_WAIT /* Interrupt the guest if needed. */ if (some_recvd && ptring_intr_enabled(ptring)) { /* Disable guest kick to avoid sending unnecessary kicks */ ptring_intr_enable(ptring, 0); nm_os_kthread_send_irq(kth); IFRATE(ptns->rate_ctx.new.hrxk++); some_recvd = false; } #endif /* Read CSB to see if there is more work to do. */ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots); #ifndef BUSY_WAIT if (ptnetmap_norxslots(kring, shadow_ring.head)) { /* * No more slots available for reception. We enable notification and * go to sleep, waiting for a kick from the guest when new receive * slots are available. */ usleep_range(1,1); /* Reenable notifications. */ ptring_kick_enable(ptring, 1); /* Doublecheck. */ ptnetmap_host_read_kring_csb(ptring, &shadow_ring, num_slots); if (!ptnetmap_norxslots(kring, shadow_ring.head)) { /* We won the race condition, more slots are available. Disable * notifications and do another cycle. 
*/ ptring_kick_enable(ptring, 0); continue; } break; } hwtail = NM_ACCESS_ONCE(kring->nr_hwtail); if (unlikely(hwtail == kring->rhead || dry_cycles >= PTN_RX_DRY_CYCLES_MAX)) { /* No more packets to be read from the backend. We stop and * wait for a notification from the backend (netmap_rx_irq). */ ND(1, "nr_hwtail: %d rhead: %d dry_cycles: %d", hwtail, kring->rhead, dry_cycles); break; } #endif if (unlikely(ptns->stopped)) { D("backend netmap is being stopped"); break; } } nm_kr_put(kring); /* Interrupt the guest if needed. */ if (some_recvd && ptring_intr_enabled(ptring)) { ptring_intr_enable(ptring, 0); nm_os_kthread_send_irq(kth); IFRATE(ptns->rate_ctx.new.hrxk++); } } #ifdef NETMAP_PT_DEBUG static void ptnetmap_print_configuration(struct ptnetmap_cfg *cfg) { int k; - D("[PTN] configuration:"); - D(" CSB ptrings @%p, num_rings=%u, features %08x", cfg->ptrings, - cfg->num_rings, cfg->features); + D("ptnetmap configuration:"); + D(" CSB ptrings @%p, num_rings=%u, cfgtype %08x", cfg->ptrings, + cfg->num_rings, cfg->cfgtype); for (k = 0; k < cfg->num_rings; k++) { - D(" ring #%d: iofd=%llu, irqfd=%llu", k, - (unsigned long long)cfg->entries[k].ioeventfd, - (unsigned long long)cfg->entries[k].irqfd); + switch (cfg->cfgtype) { + case PTNETMAP_CFGTYPE_QEMU: { + struct ptnetmap_cfgentry_qemu *e = + (struct ptnetmap_cfgentry_qemu *)(cfg+1) + k; + D(" ring #%d: ioeventfd=%lu, irqfd=%lu", k, + (unsigned long)e->ioeventfd, + (unsigned long)e->irqfd); + break; + } + + case PTNETMAP_CFGTYPE_BHYVE: + { + struct ptnetmap_cfgentry_bhyve *e = + (struct ptnetmap_cfgentry_bhyve *)(cfg+1) + k; + D(" ring #%d: wchan=%lu, ioctl_fd=%lu, " + "ioctl_cmd=%lu, msix_msg_data=%lu, msix_addr=%lu", + k, (unsigned long)e->wchan, + (unsigned long)e->ioctl_fd, + (unsigned long)e->ioctl_cmd, + (unsigned long)e->ioctl_data.msg_data, + (unsigned long)e->ioctl_data.addr); + break; + } + } } } #endif /* NETMAP_PT_DEBUG */ /* Copy actual state of the host ring into the CSB for the guest init */ static int ptnetmap_kring_snapshot(struct netmap_kring *kring, struct ptnet_ring __user *ptring) { if(CSB_WRITE(ptring, head, kring->rhead)) goto err; if(CSB_WRITE(ptring, cur, kring->rcur)) goto err; if(CSB_WRITE(ptring, hwcur, kring->nr_hwcur)) goto err; if(CSB_WRITE(ptring, hwtail, NM_ACCESS_ONCE(kring->nr_hwtail))) goto err; DBG(ptnetmap_kring_dump("ptnetmap_kring_snapshot", kring);) return 0; err: return EFAULT; } static struct netmap_kring * ptnetmap_kring(struct netmap_pt_host_adapter *pth_na, int k) { if (k < pth_na->up.num_tx_rings) { return pth_na->up.tx_rings + k; } return pth_na->up.rx_rings + k - pth_na->up.num_tx_rings; } static int ptnetmap_krings_snapshot(struct netmap_pt_host_adapter *pth_na) { struct ptnetmap_state *ptns = pth_na->ptns; struct netmap_kring *kring; unsigned int num_rings; int err = 0, k; num_rings = pth_na->up.num_tx_rings + pth_na->up.num_rx_rings; for (k = 0; k < num_rings; k++) { kring = ptnetmap_kring(pth_na, k); err |= ptnetmap_kring_snapshot(kring, ptns->ptrings + k); } return err; } /* * Functions to create, start and stop the kthreads */ static int ptnetmap_create_kthreads(struct netmap_pt_host_adapter *pth_na, struct ptnetmap_cfg *cfg) { struct ptnetmap_state *ptns = pth_na->ptns; struct nm_kthread_cfg nmk_cfg; unsigned int num_rings; + uint8_t *cfg_entries = (uint8_t *)(cfg + 1); int k; num_rings = pth_na->up.num_tx_rings + pth_na->up.num_rx_rings; for (k = 0; k < num_rings; k++) { nmk_cfg.attach_user = 1; /* attach kthread to user process */ nmk_cfg.worker_private = ptnetmap_kring(pth_na, 
k); - nmk_cfg.event = *(cfg->entries + k); nmk_cfg.type = k; if (k < pth_na->up.num_tx_rings) { nmk_cfg.worker_fn = ptnetmap_tx_handler; } else { nmk_cfg.worker_fn = ptnetmap_rx_handler; } - ptns->kthreads[k] = nm_os_kthread_create(&nmk_cfg); + ptns->kthreads[k] = nm_os_kthread_create(&nmk_cfg, + cfg->cfgtype, cfg_entries + k * cfg->entry_size); if (ptns->kthreads[k] == NULL) { goto err; } } return 0; err: for (k = 0; k < num_rings; k++) { if (ptns->kthreads[k]) { nm_os_kthread_delete(ptns->kthreads[k]); ptns->kthreads[k] = NULL; } } return EFAULT; } static int ptnetmap_start_kthreads(struct netmap_pt_host_adapter *pth_na) { struct ptnetmap_state *ptns = pth_na->ptns; int num_rings; int error; int k; if (!ptns) { D("BUG ptns is NULL"); return EFAULT; } ptns->stopped = false; num_rings = ptns->pth_na->up.num_tx_rings + ptns->pth_na->up.num_rx_rings; for (k = 0; k < num_rings; k++) { //nm_os_kthread_set_affinity(ptns->kthreads[k], xxx); error = nm_os_kthread_start(ptns->kthreads[k]); if (error) { return error; } } return 0; } static void ptnetmap_stop_kthreads(struct netmap_pt_host_adapter *pth_na) { struct ptnetmap_state *ptns = pth_na->ptns; int num_rings; int k; if (!ptns) { /* Nothing to do. */ return; } ptns->stopped = true; num_rings = ptns->pth_na->up.num_tx_rings + ptns->pth_na->up.num_rx_rings; for (k = 0; k < num_rings; k++) { nm_os_kthread_stop(ptns->kthreads[k]); } } static struct ptnetmap_cfg * ptnetmap_read_cfg(struct nmreq *nmr) { uintptr_t *nmr_ptncfg = (uintptr_t *)&nmr->nr_arg1; struct ptnetmap_cfg *cfg; struct ptnetmap_cfg tmp; size_t cfglen; if (copyin((const void *)*nmr_ptncfg, &tmp, sizeof(tmp))) { D("Partial copyin() failed"); return NULL; } - cfglen = sizeof(tmp) + tmp.num_rings * sizeof(struct ptnet_ring_cfg); + cfglen = sizeof(tmp) + tmp.num_rings * tmp.entry_size; cfg = malloc(cfglen, M_DEVBUF, M_NOWAIT | M_ZERO); if (!cfg) { return NULL; } if (copyin((const void *)*nmr_ptncfg, cfg, cfglen)) { D("Full copyin() failed"); free(cfg, M_DEVBUF); return NULL; } return cfg; } static int nm_unused_notify(struct netmap_kring *, int); static int nm_pt_host_notify(struct netmap_kring *, int); /* Create ptnetmap state and switch parent adapter to ptnetmap mode. */ static int ptnetmap_create(struct netmap_pt_host_adapter *pth_na, struct ptnetmap_cfg *cfg) { - unsigned ft_mask = (PTNETMAP_CFG_FEAT_CSB | PTNETMAP_CFG_FEAT_EVENTFD); struct ptnetmap_state *ptns; unsigned int num_rings; int ret, i; /* Check if ptnetmap state is already there. */ if (pth_na->ptns) { D("ERROR adapter %p already in ptnetmap mode", pth_na->parent); return EINVAL; } - if ((cfg->features & ft_mask) != ft_mask) { - D("ERROR ptnetmap_cfg(%x) does not contain CSB and EVENTFD", - cfg->features); - return EINVAL; - } - num_rings = pth_na->up.num_tx_rings + pth_na->up.num_rx_rings; if (num_rings != cfg->num_rings) { D("ERROR configuration mismatch, expected %u rings, found %u", num_rings, cfg->num_rings); return EINVAL; } ptns = malloc(sizeof(*ptns) + num_rings * sizeof(*ptns->kthreads), M_DEVBUF, M_NOWAIT | M_ZERO); if (!ptns) { return ENOMEM; } ptns->kthreads = (struct nm_kthread **)(ptns + 1); ptns->stopped = true; /* Cross-link data structures. */ pth_na->ptns = ptns; ptns->pth_na = pth_na; /* Store the CSB address provided by the hypervisor. 
*/ ptns->ptrings = cfg->ptrings; DBG(ptnetmap_print_configuration(cfg)); /* Create kthreads */ if ((ret = ptnetmap_create_kthreads(pth_na, cfg))) { D("ERROR ptnetmap_create_kthreads()"); goto err; } /* Copy krings state into the CSB for the guest initialization */ if ((ret = ptnetmap_krings_snapshot(pth_na))) { D("ERROR ptnetmap_krings_snapshot()"); goto err; } /* Overwrite parent nm_notify krings callback. */ pth_na->parent->na_private = pth_na; pth_na->parent_nm_notify = pth_na->parent->nm_notify; pth_na->parent->nm_notify = nm_unused_notify; for (i = 0; i < pth_na->parent->num_rx_rings; i++) { pth_na->up.rx_rings[i].save_notify = pth_na->up.rx_rings[i].nm_notify; pth_na->up.rx_rings[i].nm_notify = nm_pt_host_notify; } for (i = 0; i < pth_na->parent->num_tx_rings; i++) { pth_na->up.tx_rings[i].save_notify = pth_na->up.tx_rings[i].nm_notify; pth_na->up.tx_rings[i].nm_notify = nm_pt_host_notify; } #ifdef RATE memset(&ptns->rate_ctx, 0, sizeof(ptns->rate_ctx)); setup_timer(&ptns->rate_ctx.timer, &rate_callback, (unsigned long)&ptns->rate_ctx); if (mod_timer(&ptns->rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) D("[ptn] Error: mod_timer()\n"); #endif DBG(D("[%s] ptnetmap configuration DONE", pth_na->up.name)); return 0; err: pth_na->ptns = NULL; free(ptns, M_DEVBUF); return ret; } /* Switch parent adapter back to normal mode and destroy * ptnetmap state. */ static void ptnetmap_delete(struct netmap_pt_host_adapter *pth_na) { struct ptnetmap_state *ptns = pth_na->ptns; int num_rings; int i; if (!ptns) { /* Nothing to do. */ return; } /* Restore parent adapter callbacks. */ pth_na->parent->nm_notify = pth_na->parent_nm_notify; pth_na->parent->na_private = NULL; for (i = 0; i < pth_na->parent->num_rx_rings; i++) { pth_na->up.rx_rings[i].nm_notify = pth_na->up.rx_rings[i].save_notify; pth_na->up.rx_rings[i].save_notify = NULL; } for (i = 0; i < pth_na->parent->num_tx_rings; i++) { pth_na->up.tx_rings[i].nm_notify = pth_na->up.tx_rings[i].save_notify; pth_na->up.tx_rings[i].save_notify = NULL; } /* Delete kthreads. */ num_rings = ptns->pth_na->up.num_tx_rings + ptns->pth_na->up.num_rx_rings; for (i = 0; i < num_rings; i++) { nm_os_kthread_delete(ptns->kthreads[i]); ptns->kthreads[i] = NULL; } IFRATE(del_timer(&ptns->rate_ctx.timer)); free(ptns, M_DEVBUF); pth_na->ptns = NULL; DBG(D("[%s] ptnetmap deleted", pth_na->up.name)); } /* * Called by netmap_ioctl(). * Operation is indicated in nmr->nr_cmd. * * Called without NMG_LOCK. */ int ptnetmap_ctl(struct nmreq *nmr, struct netmap_adapter *na) { struct netmap_pt_host_adapter *pth_na; struct ptnetmap_cfg *cfg; char *name; int cmd, error = 0; name = nmr->nr_name; cmd = nmr->nr_cmd; DBG(D("name: %s", name)); if (!nm_ptnetmap_host_on(na)) { D("ERROR Netmap adapter %p is not a ptnetmap host adapter", na); error = ENXIO; goto done; } pth_na = (struct netmap_pt_host_adapter *)na; NMG_LOCK(); switch (cmd) { case NETMAP_PT_HOST_CREATE: /* Read hypervisor configuration from userspace. */ cfg = ptnetmap_read_cfg(nmr); if (!cfg) break; /* Create ptnetmap state (kthreads, ...) and switch parent * adapter to ptnetmap mode. */ error = ptnetmap_create(pth_na, cfg); free(cfg, M_DEVBUF); if (error) break; /* Start kthreads. */ error = ptnetmap_start_kthreads(pth_na); if (error) ptnetmap_delete(pth_na); break; case NETMAP_PT_HOST_DELETE: /* Stop kthreads. */ ptnetmap_stop_kthreads(pth_na); /* Switch parent adapter back to normal mode and destroy * ptnetmap state (kthreads, ...). 
*/ ptnetmap_delete(pth_na); break; default: D("ERROR invalid cmd (nmr->nr_cmd) (0x%x)", cmd); error = EINVAL; break; } NMG_UNLOCK(); done: return error; } /* nm_notify callbacks for ptnetmap */ static int nm_pt_host_notify(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)na->na_private; struct ptnetmap_state *ptns; int k; /* First check that the passthrough port is not being destroyed. */ if (unlikely(!pth_na)) { return NM_IRQ_COMPLETED; } ptns = pth_na->ptns; if (unlikely(!ptns || ptns->stopped)) { return NM_IRQ_COMPLETED; } k = kring->ring_id; /* Notify kthreads (wake up if needed) */ if (kring->tx == NR_TX) { ND(1, "TX backend irq"); IFRATE(ptns->rate_ctx.new.btxwu++); } else { k += pth_na->up.num_tx_rings; ND(1, "RX backend irq"); IFRATE(ptns->rate_ctx.new.brxwu++); } nm_os_kthread_wakeup_worker(ptns->kthreads[k]); return NM_IRQ_COMPLETED; } static int nm_unused_notify(struct netmap_kring *kring, int flags) { D("BUG this should never be called"); return ENXIO; } /* nm_config callback for bwrap */ static int nm_pt_host_config(struct netmap_adapter *na, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd) { struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)na; struct netmap_adapter *parent = pth_na->parent; int error; //XXX: maybe calling parent->nm_config is better /* forward the request */ error = netmap_update_config(parent); *rxr = na->num_rx_rings = parent->num_rx_rings; *txr = na->num_tx_rings = parent->num_tx_rings; *txd = na->num_tx_desc = parent->num_tx_desc; *rxd = na->num_rx_desc = parent->num_rx_desc; DBG(D("rxr: %d txr: %d txd: %d rxd: %d", *rxr, *txr, *txd, *rxd)); return error; } /* nm_krings_create callback for ptnetmap */ static int nm_pt_host_krings_create(struct netmap_adapter *na) { struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)na; struct netmap_adapter *parent = pth_na->parent; enum txrx t; int error; DBG(D("%s", pth_na->up.name)); /* create the parent krings */ error = parent->nm_krings_create(parent); if (error) { return error; } /* A ptnetmap host adapter points the very same krings * as its parent adapter. These pointer are used in the * TX/RX worker functions. */ na->tx_rings = parent->tx_rings; na->rx_rings = parent->rx_rings; na->tailroom = parent->tailroom; for_rx_tx(t) { struct netmap_kring *kring; /* Parent's kring_create function will initialize * its own na->si. We have to init our na->si here. */ nm_os_selinfo_init(&na->si[t]); /* Force the mem_rings_create() method to create the * host rings independently on what the regif asked for: * these rings are needed by the guest ptnetmap adapter * anyway. */ kring = &NMR(na, t)[nma_get_nrings(na, t)]; kring->nr_kflags |= NKR_NEEDRING; } return 0; } /* nm_krings_delete callback for ptnetmap */ static void nm_pt_host_krings_delete(struct netmap_adapter *na) { struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)na; struct netmap_adapter *parent = pth_na->parent; DBG(D("%s", pth_na->up.name)); parent->nm_krings_delete(parent); na->tx_rings = na->rx_rings = na->tailroom = NULL; } /* nm_register callback */ static int nm_pt_host_register(struct netmap_adapter *na, int onoff) { struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)na; struct netmap_adapter *parent = pth_na->parent; int error; DBG(D("%s onoff %d", pth_na->up.name, onoff)); if (onoff) { /* netmap_do_regif has been called on the ptnetmap na. 
* We need to pass the information about the * memory allocator to the parent before * putting it in netmap mode */ parent->na_lut = na->na_lut; } /* forward the request to the parent */ error = parent->nm_register(parent, onoff); if (error) return error; if (onoff) { na->na_flags |= NAF_NETMAP_ON | NAF_PTNETMAP_HOST; } else { ptnetmap_delete(pth_na); na->na_flags &= ~(NAF_NETMAP_ON | NAF_PTNETMAP_HOST); } return 0; } /* nm_dtor callback */ static void nm_pt_host_dtor(struct netmap_adapter *na) { struct netmap_pt_host_adapter *pth_na = (struct netmap_pt_host_adapter *)na; struct netmap_adapter *parent = pth_na->parent; DBG(D("%s", pth_na->up.name)); /* The equivalent of NETMAP_PT_HOST_DELETE if the hypervisor * didn't do it. */ ptnetmap_stop_kthreads(pth_na); ptnetmap_delete(pth_na); parent->na_flags &= ~NAF_BUSY; netmap_adapter_put(pth_na->parent); pth_na->parent = NULL; } /* check if nmr is a request for a ptnetmap adapter that we can satisfy */ int netmap_get_pt_host_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { struct nmreq parent_nmr; struct netmap_adapter *parent; /* target adapter */ struct netmap_pt_host_adapter *pth_na; struct ifnet *ifp = NULL; int error; /* Check if it is a request for a ptnetmap adapter */ if ((nmr->nr_flags & (NR_PTNETMAP_HOST)) == 0) { return 0; } D("Requesting a ptnetmap host adapter"); pth_na = malloc(sizeof(*pth_na), M_DEVBUF, M_NOWAIT | M_ZERO); if (pth_na == NULL) { D("ERROR malloc"); return ENOMEM; } /* first, try to find the adapter that we want to passthrough * We use the same nmr, after we have turned off the ptnetmap flag. * In this way we can potentially passthrough everything netmap understands. */ memcpy(&parent_nmr, nmr, sizeof(parent_nmr)); parent_nmr.nr_flags &= ~(NR_PTNETMAP_HOST); error = netmap_get_na(&parent_nmr, &parent, &ifp, create); if (error) { D("parent lookup failed: %d", error); goto put_out_noputparent; } DBG(D("found parent: %s", parent->name)); /* make sure the interface is not already in use */ if (NETMAP_OWNED_BY_ANY(parent)) { D("NIC %s busy, cannot ptnetmap", parent->name); error = EBUSY; goto put_out; } pth_na->parent = parent; /* Follow netmap_attach()-like operations for the host * ptnetmap adapter. */ //XXX pth_na->up.na_flags = parent->na_flags; pth_na->up.num_rx_rings = parent->num_rx_rings; pth_na->up.num_tx_rings = parent->num_tx_rings; pth_na->up.num_tx_desc = parent->num_tx_desc; pth_na->up.num_rx_desc = parent->num_rx_desc; pth_na->up.nm_dtor = nm_pt_host_dtor; pth_na->up.nm_register = nm_pt_host_register; /* Reuse parent's adapter txsync and rxsync methods. */ pth_na->up.nm_txsync = parent->nm_txsync; pth_na->up.nm_rxsync = parent->nm_rxsync; pth_na->up.nm_krings_create = nm_pt_host_krings_create; pth_na->up.nm_krings_delete = nm_pt_host_krings_delete; pth_na->up.nm_config = nm_pt_host_config; /* Set the notify method only or convenience, it will never * be used, since - differently from default krings_create - we * ptnetmap krings_create callback inits kring->nm_notify * directly. 
*/ pth_na->up.nm_notify = nm_unused_notify; pth_na->up.nm_mem = parent->nm_mem; pth_na->up.na_flags |= NAF_HOST_RINGS; error = netmap_attach_common(&pth_na->up); if (error) { D("ERROR netmap_attach_common()"); goto put_out; } *na = &pth_na->up; netmap_adapter_get(*na); /* set parent busy, because attached for ptnetmap */ parent->na_flags |= NAF_BUSY; strncpy(pth_na->up.name, parent->name, sizeof(pth_na->up.name)); strcat(pth_na->up.name, "-PTN"); DBG(D("%s ptnetmap request DONE", pth_na->up.name)); /* drop the reference to the ifp, if any */ if (ifp) if_rele(ifp); return 0; put_out: netmap_adapter_put(parent); if (ifp) if_rele(ifp); put_out_noputparent: free(pth_na, M_DEVBUF); return error; } #endif /* WITH_PTNETMAP_HOST */ #ifdef WITH_PTNETMAP_GUEST /* - * GUEST ptnetmap generic txsync()/rxsync() used in e1000/virtio-net device - * driver notify is set when we need to send notification to the host - * (driver-specific) + * Guest ptnetmap txsync()/rxsync() routines, used in ptnet device drivers. + * These routines are reused across the different operating systems supported + * by netmap. */ /* * Reconcile host and guest views of the transmit ring. * * Guest user wants to transmit packets up to the one before ring->head, * and guest kernel knows tx_ring->hwcur is the first packet unsent * by the host kernel. * * We push out as many packets as possible, and possibly * reclaim buffers from previously completed transmission. * * Notifications from the host are enabled only if the user guest would * block (no space in the ring). */ bool netmap_pt_guest_txsync(struct ptnet_ring *ptring, struct netmap_kring *kring, int flags) { bool notify = false; /* Disable notifications */ ptring->guest_need_kick = 0; /* * First part: tell the host (updating the CSB) to process the new * packets. */ kring->nr_hwcur = ptring->hwcur; ptnetmap_guest_write_kring_csb(ptring, kring->rcur, kring->rhead); /* Ask for a kick from a guest to the host if needed. */ if ((kring->rhead != kring->nr_hwcur && NM_ACCESS_ONCE(ptring->host_need_kick)) || (flags & NAF_FORCE_RECLAIM)) { ptring->sync_flags = flags; notify = true; } /* * Second part: reclaim buffers for completed transmissions. */ if (nm_kr_txempty(kring) || (flags & NAF_FORCE_RECLAIM)) { ptnetmap_guest_read_kring_csb(ptring, kring); } /* * No more room in the ring for new transmissions. The user thread will * go to sleep and we need to be notified by the host when more free * space is available. */ if (nm_kr_txempty(kring)) { /* Reenable notifications. */ ptring->guest_need_kick = 1; /* Double check */ ptnetmap_guest_read_kring_csb(ptring, kring); /* If there is new free space, disable notifications */ if (unlikely(!nm_kr_txempty(kring))) { ptring->guest_need_kick = 0; } } ND(1, "TX - CSB: head:%u cur:%u hwtail:%u - KRING: head:%u cur:%u tail: %u", ptring->head, ptring->cur, ptring->hwtail, kring->rhead, kring->rcur, kring->nr_hwtail); return notify; } /* * Reconcile host and guest view of the receive ring. * * Update hwcur/hwtail from host (reading from CSB). * * If guest user has released buffers up to the one before ring->head, we * also give them to the host. * * Notifications from the host are enabled only if the user guest would * block (no more completed slots in the ring). 
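 * A ptnet driver's rxsync callback is expected to be little more than a
 * wrapper around this helper; as an illustrative sketch (ptnet_kring_ptring()
 * and ptnet_kick() are hypothetical driver helpers, not part of this API):
 *
 *   static int
 *   ptnet_nm_rxsync(struct netmap_kring *kring, int flags)
 *   {
 *       struct ptnet_ring *ptring = ptnet_kring_ptring(kring);
 *
 *       if (netmap_pt_guest_rxsync(ptring, kring, flags))
 *           ptnet_kick(ptring);  // e.g. write a doorbell register
 *       return 0;
 *   }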
*/ bool netmap_pt_guest_rxsync(struct ptnet_ring *ptring, struct netmap_kring *kring, int flags) { bool notify = false; /* Disable notifications */ ptring->guest_need_kick = 0; /* * First part: import newly received packets, by updating the kring * hwtail to the hwtail known from the host (read from the CSB). * This also updates the kring hwcur. */ ptnetmap_guest_read_kring_csb(ptring, kring); kring->nr_kflags &= ~NKR_PENDINTR; /* * Second part: tell the host about the slots that guest user has * released, by updating cur and head in the CSB. */ if (kring->rhead != kring->nr_hwcur) { ptnetmap_guest_write_kring_csb(ptring, kring->rcur, kring->rhead); /* Ask for a kick from the guest to the host if needed. */ if (NM_ACCESS_ONCE(ptring->host_need_kick)) { ptring->sync_flags = flags; notify = true; } } /* * No more completed RX slots. The user thread will go to sleep and * we need to be notified by the host when more RX slots have been * completed. */ if (nm_kr_rxempty(kring)) { /* Reenable notifications. */ ptring->guest_need_kick = 1; /* Double check */ ptnetmap_guest_read_kring_csb(ptring, kring); /* If there are new slots, disable notifications. */ if (!nm_kr_rxempty(kring)) { ptring->guest_need_kick = 0; } } ND(1, "RX - CSB: head:%u cur:%u hwtail:%u - KRING: head:%u cur:%u", ptring->head, ptring->cur, ptring->hwtail, kring->rhead, kring->rcur); return notify; } /* * Callbacks for ptnet drivers: nm_krings_create, nm_krings_delete, nm_dtor. */ int ptnet_nm_krings_create(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; /* Upcast. */ struct netmap_adapter *na_nm = &ptna->hwup.up; struct netmap_adapter *na_dr = &ptna->dr.up; int ret; if (ptna->backend_regifs) { return 0; } /* Create krings on the public netmap adapter. */ ret = netmap_hw_krings_create(na_nm); if (ret) { return ret; } /* Copy krings into the netmap adapter private to the driver. */ na_dr->tx_rings = na_nm->tx_rings; na_dr->rx_rings = na_nm->rx_rings; return 0; } void ptnet_nm_krings_delete(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; /* Upcast. */ struct netmap_adapter *na_nm = &ptna->hwup.up; struct netmap_adapter *na_dr = &ptna->dr.up; if (ptna->backend_regifs) { return; } na_dr->tx_rings = NULL; na_dr->rx_rings = NULL; netmap_hw_krings_delete(na_nm); } void ptnet_nm_dtor(struct netmap_adapter *na) { struct netmap_pt_guest_adapter *ptna = (struct netmap_pt_guest_adapter *)na; netmap_mem_put(ptna->dr.up.nm_mem); memset(&ptna->dr, 0, sizeof(ptna->dr)); netmap_mem_pt_guest_ifp_del(na->nm_mem, na->ifp); } #endif /* WITH_PTNETMAP_GUEST */ Index: user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_vale.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_vale.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/netmap/netmap_vale.c (revision 308054) @@ -1,2778 +1,2778 @@ /* * Copyright (C) 2013-2016 Universita` di Pisa * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module implements the VALE switch for netmap --- VALE SWITCH --- NMG_LOCK() serializes all modifications to switches and ports. A switch cannot be deleted until all ports are gone. For each switch, an SX lock (RWlock on linux) protects deletion of ports. When configuring or deleting a new port, the lock is acquired in exclusive mode (after holding NMG_LOCK). When forwarding, the lock is acquired in shared mode (without NMG_LOCK). The lock is held throughout the entire forwarding cycle, during which the thread may incur in a page fault. Hence it is important that sleepable shared locks are used. On the rx ring, the per-port lock is grabbed initially to reserve a number of slot in the ring, then the lock is released, packets are copied from source to destination, and then the lock is acquired again and the receive ring is updated. (A similar thing is done on the tx ring for NIC and host stack ports attached to the switch) */ /* * OS-specific code that is used only within this file. * Other OS-specific code that must be accessed by drivers * is present in netmap_kern.h */ #if defined(__FreeBSD__) #include /* prerequisite */ __FBSDID("$FreeBSD$"); #include #include #include /* defines used in kernel.h */ #include /* types used in module initialization */ #include /* cdevsw struct, UID, GID */ #include #include /* struct socket */ #include #include #include #include /* sockaddrs */ #include #include #include #include #include /* BIOCIMMEDIATE */ #include /* bus_dmamap_* */ #include #include #define BDG_RWLOCK_T struct rwlock // struct rwlock #define BDG_RWINIT(b) \ rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) #elif defined(linux) #include "bsd_glue.h" #elif defined(__APPLE__) #warning OSX support is only partial #include "osx_glue.h" #elif defined(_WIN32) #include "win_glue.h" #else #error Unsupported platform #endif /* unsupported */ /* * common headers */ #include #include #include #ifdef WITH_VALE /* * system parameters (most of them in netmap_kern.h) * NM_BDG_NAME prefix for switch port names, default "vale" * NM_BDG_MAXPORTS number of ports * NM_BRIDGES max number of switches in the system. * XXX should become a sysctl or tunable * * Switch ports are named valeX:Y where X is the switch name and Y * is the port. If Y matches a physical interface name, the port is * connected to a physical device. * * Unlike physical interfaces, switch ports use their own memory region * for rings and buffers. 
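 * (For example, a client can create or attach to port "p0" of switch
 * "vale0" simply by opening the name "vale0:p0" through the usual netmap
 * interface, e.g. nm_open("vale0:p0", NULL, 0, NULL) from netmap_user.h;
 * the port name here is purely illustrative.)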
* The virtual interfaces use per-queue lock instead of core lock. * In the tx loop, we aggregate traffic in batches to make all operations * faster. The batch size is bridge_batch. */ #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ #define NM_BDG_HASH 1024 /* forwarding table entries */ #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ #define NM_MULTISEG 64 /* max size of a chain of bufs */ /* actual size of the tables */ #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) /* NM_FT_NULL terminates a list of slots in the ft */ #define NM_FT_NULL NM_BDG_BATCH_MAX /* * bridge_batch is set via sysctl to the max batch size to be * used in the bridge. The actual value may be larger as the * last packet in the block may overflow the size. */ static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ SYSBEGIN(vars_vale); SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); SYSEND; static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **); static int netmap_vp_reg(struct netmap_adapter *na, int onoff); static int netmap_bwrap_reg(struct netmap_adapter *, int onoff); /* * For each output interface, nm_bdg_q is used to construct a list. * bq_len is the number of output buffers (we can have coalescing * during the copy). */ struct nm_bdg_q { uint16_t bq_head; uint16_t bq_tail; uint32_t bq_len; /* number of buffers */ }; /* XXX revise this */ struct nm_hash_ent { uint64_t mac; /* the top 2 bytes are the epoch */ uint64_t ports; }; /* * nm_bridge is a descriptor for a VALE switch. * Interfaces for a bridge are all in bdg_ports[]. * The array has fixed size, an empty entry does not terminate * the search, but lookups only occur on attach/detach so we * don't mind if they are slow. * * The bridge is non blocking on the transmit ports: excess * packets are dropped if there is no room on the output port. * * bdg_lock protects accesses to the bdg_ports array. * This is a rw lock (or equivalent). */ struct nm_bridge { /* XXX what is the proper alignment/layout ? */ BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ int bdg_namelen; uint32_t bdg_active_ports; /* 0 means free */ char bdg_basename[IFNAMSIZ]; /* Indexes of active ports (up to active_ports) * and all other remaining ports. */ uint8_t bdg_port_index[NM_BDG_MAXPORTS]; struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS]; /* * The function to decide the destination port. * It returns either of an index of the destination port, * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to * forward this packet. ring_nr is the source ring index, and the * function may overwrite this value to forward this packet to a * different ring index. * This function must be set by netmap_bdg_ctl(). */ struct netmap_bdg_ops bdg_ops; /* the forwarding table, MAC+ports. * XXX should be changed to an argument to be passed to * the lookup function, and allocated on attach */ struct nm_hash_ent ht[NM_BDG_HASH]; #ifdef CONFIG_NET_NS struct net *ns; #endif /* CONFIG_NET_NS */ }; const char* netmap_bdg_name(struct netmap_vp_adapter *vp) { struct nm_bridge *b = vp->na_bdg; if (b == NULL) return NULL; return b->bdg_basename; } #ifndef CONFIG_NET_NS /* * XXX in principle nm_bridges could be created dynamically * Right now we have a static array and deletions are protected * by an exclusive lock. 
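 * The backing storage is set up at module init by netmap_init_bridges2()
 * (near the end of this file).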
*/ static struct nm_bridge *nm_bridges; #endif /* !CONFIG_NET_NS */ /* * this is a slightly optimized copy routine which rounds * to multiple of 64 bytes and is often faster than dealing * with other odd sizes. We assume there is enough room * in the source and destination buffers. * * XXX only for multiples of 64 bytes, non overlapped. */ static inline void pkt_copy(void *_src, void *_dst, int l) { uint64_t *src = _src; uint64_t *dst = _dst; if (unlikely(l >= 1024)) { memcpy(dst, src, l); return; } for (; likely(l > 0); l-=64) { *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; } } static int nm_is_id_char(const char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || (c == '_'); } /* Validate the name of a VALE bridge port and return the * position of the ":" character. */ static int nm_vale_name_validate(const char *name) { int colon_pos = -1; int i; if (!name || strlen(name) < strlen(NM_BDG_NAME)) { return -1; } for (i = 0; name[i]; i++) { if (name[i] == ':') { if (colon_pos != -1) { return -1; } colon_pos = i; } else if (!nm_is_id_char(name[i])) { return -1; } } if (i >= IFNAMSIZ) { return -1; } return colon_pos; } /* * locate a bridge among the existing ones. * MUST BE CALLED WITH NMG_LOCK() * * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. * We assume that this is called with a name of at least NM_NAME chars. */ static struct nm_bridge * nm_find_bridge(const char *name, int create) { int i, namelen; struct nm_bridge *b = NULL, *bridges; u_int num_bridges; NMG_LOCK_ASSERT(); netmap_bns_getbridges(&bridges, &num_bridges); namelen = nm_vale_name_validate(name); if (namelen < 0) { D("invalid bridge name %s", name ? name : NULL); return NULL; } /* lookup the name, remember empty slot if there is one */ for (i = 0; i < num_bridges; i++) { struct nm_bridge *x = bridges + i; if (x->bdg_active_ports == 0) { if (create && b == NULL) b = x; /* record empty slot */ } else if (x->bdg_namelen != namelen) { continue; } else if (strncmp(name, x->bdg_basename, namelen) == 0) { ND("found '%.*s' at %d", namelen, name, i); b = x; break; } } if (i == num_bridges && b) { /* name not found, can create entry */ /* initialize the bridge */ strncpy(b->bdg_basename, name, namelen); ND("create new bridge %s with ports %d", b->bdg_basename, b->bdg_active_ports); b->bdg_namelen = namelen; b->bdg_active_ports = 0; for (i = 0; i < NM_BDG_MAXPORTS; i++) b->bdg_port_index[i] = i; /* set the default function */ b->bdg_ops.lookup = netmap_bdg_learning; /* reset the MAC address table */ bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); NM_BNS_GET(b); } return b; } /* * Free the forwarding tables for rings attached to switch ports. */ static void nm_free_bdgfwd(struct netmap_adapter *na) { int nrings, i; struct netmap_kring *kring; NMG_LOCK_ASSERT(); nrings = na->num_tx_rings; kring = na->tx_rings; for (i = 0; i < nrings; i++) { if (kring[i].nkr_ft) { free(kring[i].nkr_ft, M_DEVBUF); kring[i].nkr_ft = NULL; /* protect from freeing twice */ } } } /* * Allocate the forwarding tables for the rings attached to the bridge ports. 
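 *
 * Each tx kring gets one contiguous scratch area, laid out as follows
 * (a reading aid; the sizes match the allocation below):
 *
 *	struct nm_bdg_fwd ft[NM_BDG_BATCH_MAX];    one entry per source slot
 *	struct nm_bdg_q   dstq[NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1];
 *	uint16_t          dsts[NM_BDG_BATCH_MAX];  indexes of active queues
 *
 * nm_bdg_flush() recovers the dstq and dsts pointers from ft using the
 * same offsets.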
*/ static int nm_alloc_bdgfwd(struct netmap_adapter *na) { int nrings, l, i, num_dstq; struct netmap_kring *kring; NMG_LOCK_ASSERT(); /* all port:rings + broadcast */ num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; l += sizeof(struct nm_bdg_q) * num_dstq; l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; nrings = netmap_real_rings(na, NR_TX); kring = na->tx_rings; for (i = 0; i < nrings; i++) { struct nm_bdg_fwd *ft; struct nm_bdg_q *dstq; int j; ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); if (!ft) { nm_free_bdgfwd(na); return ENOMEM; } dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); for (j = 0; j < num_dstq; j++) { dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; dstq[j].bq_len = 0; } kring[i].nkr_ft = ft; } return 0; } /* remove from bridge b the ports in slots hw and sw * (sw can be -1 if not needed) */ static void netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) { int s_hw = hw, s_sw = sw; int i, lim =b->bdg_active_ports; uint8_t tmp[NM_BDG_MAXPORTS]; /* New algorithm: make a copy of bdg_port_index; lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port in the array of bdg_port_index, replacing them with entries from the bottom of the array; decrement bdg_active_ports; acquire BDG_WLOCK() and copy back the array. */ if (netmap_verbose) D("detach %d and %d (lim %d)", hw, sw, lim); /* make a copy of the list of active ports, update it, * and then copy back within BDG_WLOCK(). */ memcpy(tmp, b->bdg_port_index, sizeof(tmp)); for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { if (hw >= 0 && tmp[i] == hw) { ND("detach hw %d at %d", hw, i); lim--; /* point to last active port */ tmp[i] = tmp[lim]; /* swap with i */ tmp[lim] = hw; /* now this is inactive */ hw = -1; } else if (sw >= 0 && tmp[i] == sw) { ND("detach sw %d at %d", sw, i); lim--; tmp[i] = tmp[lim]; tmp[lim] = sw; sw = -1; } else { i++; } } if (hw >= 0 || sw >= 0) { D("XXX delete failed hw %d sw %d, should panic...", hw, sw); } BDG_WLOCK(b); if (b->bdg_ops.dtor) b->bdg_ops.dtor(b->bdg_ports[s_hw]); b->bdg_ports[s_hw] = NULL; if (s_sw >= 0) { b->bdg_ports[s_sw] = NULL; } memcpy(b->bdg_port_index, tmp, sizeof(tmp)); b->bdg_active_ports = lim; BDG_WUNLOCK(b); ND("now %d active ports", lim); if (lim == 0) { ND("marking bridge %s as free", b->bdg_basename); bzero(&b->bdg_ops, sizeof(b->bdg_ops)); NM_BNS_PUT(b); } } /* nm_bdg_ctl callback for VALE ports */ static int netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; struct nm_bridge *b = vpna->na_bdg; (void)nmr; // XXX merge ? 
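	/*
	 * For a plain VALE port, attach is a no-op: the port has already
	 * been linked to the bridge by netmap_get_bdg_na().  On detach we
	 * quiesce the rings, unlink the port from the bridge and drop the
	 * reference that was taken at attach time.
	 */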
if (attach) return 0; /* nothing to do */ if (b) { netmap_set_all_rings(na, 0 /* disable */); netmap_bdg_detach_common(b, vpna->bdg_port, -1); vpna->na_bdg = NULL; netmap_set_all_rings(na, 1 /* enable */); } /* I have took reference just for attach */ netmap_adapter_put(na); return 0; } /* nm_dtor callback for ephemeral VALE ports */ static void netmap_vp_dtor(struct netmap_adapter *na) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; struct nm_bridge *b = vpna->na_bdg; ND("%s has %d references", na->name, na->na_refcount); if (b) { netmap_bdg_detach_common(b, vpna->bdg_port, -1); } } /* remove a persistent VALE port from the system */ static int nm_vi_destroy(const char *name) { struct ifnet *ifp; int error; ifp = ifunit_ref(name); if (!ifp) return ENXIO; NMG_LOCK(); /* make sure this is actually a VALE port */ if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { error = EINVAL; goto err; } if (NA(ifp)->na_refcount > 1) { error = EBUSY; goto err; } NMG_UNLOCK(); D("destroying a persistent vale interface %s", ifp->if_xname); /* Linux requires all the references are released * before unregister */ if_rele(ifp); netmap_detach(ifp); nm_os_vi_detach(ifp); return 0; err: NMG_UNLOCK(); if_rele(ifp); return error; } /* * Create a virtual interface registered to the system. * The interface will be attached to a bridge later. */ static int nm_vi_create(struct nmreq *nmr) { struct ifnet *ifp; struct netmap_vp_adapter *vpna; int error; /* don't include VALE prefix */ if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) return EINVAL; ifp = ifunit_ref(nmr->nr_name); if (ifp) { /* already exist, cannot create new one */ if_rele(ifp); return EEXIST; } error = nm_os_vi_persist(nmr->nr_name, &ifp); if (error) return error; NMG_LOCK(); /* netmap_vp_create creates a struct netmap_vp_adapter */ error = netmap_vp_create(nmr, ifp, &vpna); if (error) { D("error %d", error); nm_os_vi_detach(ifp); return error; } /* persist-specific routines */ vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; netmap_adapter_get(&vpna->up); NM_ATTACH_NA(ifp, &vpna->up); NMG_UNLOCK(); D("created %s", ifp->if_xname); return 0; } /* Try to get a reference to a netmap adapter attached to a VALE switch. * If the adapter is found (or is created), this function returns 0, a * non NULL pointer is returned into *na, and the caller holds a * reference to the adapter. * If an adapter is not found, then no reference is grabbed and the * function returns an error code, or 0 if there is just a VALE prefix * mismatch. Therefore the caller holds a reference when * (*na != NULL && return == 0). */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { char *nr_name = nmr->nr_name; const char *ifname; struct ifnet *ifp; int error = 0; struct netmap_vp_adapter *vpna, *hostna = NULL; struct nm_bridge *b; int i, j, cand = -1, cand2 = -1; int needed; *na = NULL; /* default return value */ /* first try to see if this is a bridge port. */ NMG_LOCK_ASSERT(); if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) { return 0; /* no error, but no VALE prefix */ } b = nm_find_bridge(nr_name, create); if (b == NULL) { D("no bridges available for '%s'", nr_name); return (create ? ENOMEM : ENXIO); } if (strlen(nr_name) < b->bdg_namelen) /* impossible */ panic("x"); /* Now we are sure that name starts with the bridge's name, * lookup the port in the bridge. We need to scan the entire * list. 
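 * The scan is linear in the number of active ports, which is fine
 * because it only happens on attach/detach, never on the data path.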
It is not important to hold a WLOCK on the bridge * during the search because NMG_LOCK already guarantees * that there are no other possible writers. */ /* lookup in the local list of ports */ for (j = 0; j < b->bdg_active_ports; j++) { i = b->bdg_port_index[j]; vpna = b->bdg_ports[i]; // KASSERT(na != NULL); ND("checking %s", vpna->up.name); if (!strcmp(vpna->up.name, nr_name)) { netmap_adapter_get(&vpna->up); ND("found existing if %s refs %d", nr_name) *na = &vpna->up; return 0; } } /* not found, should we create it? */ if (!create) return ENXIO; /* yes we should, see if we have space to attach entries */ needed = 2; /* in some cases we only need 1 */ if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { D("bridge full %d, cannot create new port", b->bdg_active_ports); return ENOMEM; } /* record the next two ports available, but do not allocate yet */ cand = b->bdg_port_index[b->bdg_active_ports]; cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; ND("+++ bridge %s port %s used %d avail %d %d", b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2); /* * try see if there is a matching NIC with this name * (after the bridge's name) */ ifname = nr_name + b->bdg_namelen + 1; ifp = ifunit_ref(ifname); if (!ifp) { /* Create an ephemeral virtual port * This block contains all the ephemeral-specific logics */ if (nmr->nr_cmd) { /* nr_cmd must be 0 for a virtual port */ return EINVAL; } /* bdg_netmap_attach creates a struct netmap_adapter */ error = netmap_vp_create(nmr, NULL, &vpna); if (error) { D("error %d", error); free(ifp, M_DEVBUF); return error; } /* shortcut - we can skip get_hw_na(), * ownership check and nm_bdg_attach() */ } else { struct netmap_adapter *hw; error = netmap_get_hw_na(ifp, &hw); if (error || hw == NULL) goto out; /* host adapter might not be created */ error = hw->nm_bdg_attach(nr_name, hw); if (error) goto out; vpna = hw->na_vp; hostna = hw->na_hostvp; if (nmr->nr_arg1 != NETMAP_BDG_HOST) hostna = NULL; } BDG_WLOCK(b); vpna->bdg_port = cand; ND("NIC %p to bridge port %d", vpna, cand); /* bind the port to the bridge (virtual ports are not active) */ b->bdg_ports[cand] = vpna; vpna->na_bdg = b; b->bdg_active_ports++; if (hostna != NULL) { /* also bind the host stack to the bridge */ b->bdg_ports[cand2] = hostna; hostna->bdg_port = cand2; hostna->na_bdg = b; b->bdg_active_ports++; ND("host %p to bridge port %d", hostna, cand2); } ND("if %s refs %d", ifname, vpna->up.na_refcount); BDG_WUNLOCK(b); *na = &vpna->up; netmap_adapter_get(*na); return 0; out: if_rele(ifp); return error; } /* Process NETMAP_BDG_ATTACH */ static int nm_bdg_ctl_attach(struct nmreq *nmr) { struct netmap_adapter *na; int error; NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */); if (error) /* no device */ goto unlock_exit; if (na == NULL) { /* VALE prefix missing */ error = EINVAL; goto unlock_exit; } if (NETMAP_OWNED_BY_ANY(na)) { error = EBUSY; goto unref_exit; } if (na->nm_bdg_ctl) { /* nop for VALE ports. 
The bwrap needs to put the hwna * in netmap mode (see netmap_bwrap_bdg_ctl) */ error = na->nm_bdg_ctl(na, nmr, 1); if (error) goto unref_exit; ND("registered %s to netmap-mode", na->name); } NMG_UNLOCK(); return 0; unref_exit: netmap_adapter_put(na); unlock_exit: NMG_UNLOCK(); return error; } static inline int nm_is_bwrap(struct netmap_adapter *na) { return na->nm_register == netmap_bwrap_reg; } /* process NETMAP_BDG_DETACH */ static int nm_bdg_ctl_detach(struct nmreq *nmr) { struct netmap_adapter *na; int error; NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */); if (error) { /* no device, or another bridge or user owns the device */ goto unlock_exit; } if (na == NULL) { /* VALE prefix missing */ error = EINVAL; goto unlock_exit; } else if (nm_is_bwrap(na) && ((struct netmap_bwrap_adapter *)na)->na_polling_state) { /* Don't detach a NIC with polling */ error = EBUSY; netmap_adapter_put(na); goto unlock_exit; } if (na->nm_bdg_ctl) { /* remove the port from bridge. The bwrap * also needs to put the hwna in normal mode */ error = na->nm_bdg_ctl(na, nmr, 0); } netmap_adapter_put(na); unlock_exit: NMG_UNLOCK(); return error; } struct nm_bdg_polling_state; struct nm_bdg_kthread { struct nm_kthread *nmk; u_int qfirst; u_int qlast; struct nm_bdg_polling_state *bps; }; struct nm_bdg_polling_state { bool configured; bool stopped; struct netmap_bwrap_adapter *bna; u_int reg; u_int qfirst; u_int qlast; u_int cpu_from; u_int ncpus; struct nm_bdg_kthread *kthreads; }; static void netmap_bwrap_polling(void *data) { struct nm_bdg_kthread *nbk = data; struct netmap_bwrap_adapter *bna; u_int qfirst, qlast, i; struct netmap_kring *kring0, *kring; if (!nbk) return; qfirst = nbk->qfirst; qlast = nbk->qlast; bna = nbk->bps->bna; kring0 = NMR(bna->hwna, NR_RX); for (i = qfirst; i < qlast; i++) { kring = kring0 + i; kring->nm_notify(kring, 0); } } static int nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps) { struct nm_kthread_cfg kcfg; int i, j; bps->kthreads = malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus, M_DEVBUF, M_NOWAIT | M_ZERO); if (bps->kthreads == NULL) return ENOMEM; bzero(&kcfg, sizeof(kcfg)); kcfg.worker_fn = netmap_bwrap_polling; for (i = 0; i < bps->ncpus; i++) { struct nm_bdg_kthread *t = bps->kthreads + i; int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC); int affinity = bps->cpu_from + i; t->bps = bps; t->qfirst = all ? bps->qfirst /* must be 0 */: affinity; t->qlast = all ? 
bps->qlast : t->qfirst + 1; D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst, t->qlast); kcfg.type = i; kcfg.worker_private = t; - t->nmk = nm_os_kthread_create(&kcfg); + t->nmk = nm_os_kthread_create(&kcfg, 0, NULL); if (t->nmk == NULL) { goto cleanup; } nm_os_kthread_set_affinity(t->nmk, affinity); } return 0; cleanup: for (j = 0; j < i; j++) { struct nm_bdg_kthread *t = bps->kthreads + i; nm_os_kthread_delete(t->nmk); } free(bps->kthreads, M_DEVBUF); return EFAULT; } /* a version of ptnetmap_start_kthreads() */ static int nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps) { int error, i, j; if (!bps) { D("polling is not configured"); return EFAULT; } bps->stopped = false; for (i = 0; i < bps->ncpus; i++) { struct nm_bdg_kthread *t = bps->kthreads + i; error = nm_os_kthread_start(t->nmk); if (error) { D("error in nm_kthread_start()"); goto cleanup; } } return 0; cleanup: for (j = 0; j < i; j++) { struct nm_bdg_kthread *t = bps->kthreads + i; nm_os_kthread_stop(t->nmk); } bps->stopped = true; return error; } static void nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps) { int i; if (!bps) return; for (i = 0; i < bps->ncpus; i++) { struct nm_bdg_kthread *t = bps->kthreads + i; nm_os_kthread_stop(t->nmk); nm_os_kthread_delete(t->nmk); } bps->stopped = true; } static int get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na, struct nm_bdg_polling_state *bps) { int req_cpus, avail_cpus, core_from; u_int reg, i, qfirst, qlast; avail_cpus = nm_os_ncpus(); req_cpus = nmr->nr_arg1; if (req_cpus == 0) { D("req_cpus must be > 0"); return EINVAL; } else if (req_cpus >= avail_cpus) { D("for safety, we need at least one core left in the system"); return EINVAL; } reg = nmr->nr_flags & NR_REG_MASK; i = nmr->nr_ringid & NETMAP_RING_MASK; /* * ONE_NIC: dedicate one core to one ring. If multiple cores * are specified, consecutive rings are also polled. * For example, if ringid=2 and 2 cores are given, * ring 2 and 3 are polled by core 2 and 3, respectively. * ALL_NIC: poll all the rings using a core specified by ringid. * the number of cores must be 1. */ if (reg == NR_REG_ONE_NIC) { if (i + req_cpus > nma_get_nrings(na, NR_RX)) { D("only %d rings exist (ring %u-%u is given)", nma_get_nrings(na, NR_RX), i, i+req_cpus); return EINVAL; } qfirst = i; qlast = qfirst + req_cpus; core_from = qfirst; } else if (reg == NR_REG_ALL_NIC) { if (req_cpus != 1) { D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus); return EINVAL; } qfirst = 0; qlast = nma_get_nrings(na, NR_RX); core_from = i; } else { D("reg must be ALL_NIC or ONE_NIC"); return EINVAL; } bps->reg = reg; bps->qfirst = qfirst; bps->qlast = qlast; bps->cpu_from = core_from; bps->ncpus = req_cpus; D("%s qfirst %u qlast %u cpu_from %u ncpus %u", reg == NR_REG_ALL_NIC ? 
"REG_ALL_NIC" : "REG_ONE_NIC", qfirst, qlast, core_from, req_cpus); return 0; } static int nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na) { struct nm_bdg_polling_state *bps; struct netmap_bwrap_adapter *bna; int error; bna = (struct netmap_bwrap_adapter *)na; if (bna->na_polling_state) { D("ERROR adapter already in polling mode"); return EFAULT; } bps = malloc(sizeof(*bps), M_DEVBUF, M_NOWAIT | M_ZERO); if (!bps) return ENOMEM; bps->configured = false; bps->stopped = true; if (get_polling_cfg(nmr, na, bps)) { free(bps, M_DEVBUF); return EINVAL; } if (nm_bdg_create_kthreads(bps)) { free(bps, M_DEVBUF); return EFAULT; } bps->configured = true; bna->na_polling_state = bps; bps->bna = bna; /* disable interrupt if possible */ if (bna->hwna->nm_intr) bna->hwna->nm_intr(bna->hwna, 0); /* start kthread now */ error = nm_bdg_polling_start_kthreads(bps); if (error) { D("ERROR nm_bdg_polling_start_kthread()"); free(bps->kthreads, M_DEVBUF); free(bps, M_DEVBUF); bna->na_polling_state = NULL; if (bna->hwna->nm_intr) bna->hwna->nm_intr(bna->hwna, 1); } return error; } static int nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct nm_bdg_polling_state *bps; if (!bna->na_polling_state) { D("ERROR adapter is not in polling mode"); return EFAULT; } bps = bna->na_polling_state; nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); bps->configured = false; free(bps, M_DEVBUF); bna->na_polling_state = NULL; /* reenable interrupt */ if (bna->hwna->nm_intr) bna->hwna->nm_intr(bna->hwna, 1); return 0; } /* Called by either user's context (netmap_ioctl()) * or external kernel modules (e.g., Openvswitch). * Operation is indicated in nmr->nr_cmd. * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge * requires bdg_ops argument; the other commands ignore this argument. * * Called without NMG_LOCK. */ int netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) { struct nm_bridge *b, *bridges; struct netmap_adapter *na; struct netmap_vp_adapter *vpna; char *name = nmr->nr_name; int cmd = nmr->nr_cmd, namelen = strlen(name); int error = 0, i, j; u_int num_bridges; netmap_bns_getbridges(&bridges, &num_bridges); switch (cmd) { case NETMAP_BDG_NEWIF: error = nm_vi_create(nmr); break; case NETMAP_BDG_DELIF: error = nm_vi_destroy(nmr->nr_name); break; case NETMAP_BDG_ATTACH: error = nm_bdg_ctl_attach(nmr); break; case NETMAP_BDG_DETACH: error = nm_bdg_ctl_detach(nmr); break; case NETMAP_BDG_LIST: /* this is used to enumerate bridges and ports */ if (namelen) { /* look up indexes of bridge and port */ if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) { error = EINVAL; break; } NMG_LOCK(); b = nm_find_bridge(name, 0 /* don't create */); if (!b) { error = ENOENT; NMG_UNLOCK(); break; } error = 0; nmr->nr_arg1 = b - bridges; /* bridge index */ nmr->nr_arg2 = NM_BDG_NOPORT; for (j = 0; j < b->bdg_active_ports; j++) { i = b->bdg_port_index[j]; vpna = b->bdg_ports[i]; if (vpna == NULL) { D("---AAAAAAAAARGH-------"); continue; } /* the former and the latter identify a * virtual port and a NIC, respectively */ if (!strcmp(vpna->up.name, name)) { nmr->nr_arg2 = i; /* port index */ break; } } NMG_UNLOCK(); } else { /* return the first non-empty entry starting from * bridge nr_arg1 and port nr_arg2. 
* * Users can detect the end of the same bridge by * seeing the new and old value of nr_arg1, and can * detect the end of all the bridge by error != 0 */ i = nmr->nr_arg1; j = nmr->nr_arg2; NMG_LOCK(); for (error = ENOENT; i < NM_BRIDGES; i++) { b = bridges + i; if (j >= b->bdg_active_ports) { j = 0; /* following bridges scan from 0 */ continue; } nmr->nr_arg1 = i; nmr->nr_arg2 = j; j = b->bdg_port_index[j]; vpna = b->bdg_ports[j]; strncpy(name, vpna->up.name, (size_t)IFNAMSIZ); error = 0; break; } NMG_UNLOCK(); } break; case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */ /* register callbacks to the given bridge. * nmr->nr_name may be just bridge's name (including ':' * if it is not just NM_NAME). */ if (!bdg_ops) { error = EINVAL; break; } NMG_LOCK(); b = nm_find_bridge(name, 0 /* don't create */); if (!b) { error = EINVAL; } else { b->bdg_ops = *bdg_ops; } NMG_UNLOCK(); break; case NETMAP_BDG_VNET_HDR: /* Valid lengths for the virtio-net header are 0 (no header), 10 and 12. */ if (nmr->nr_arg1 != 0 && nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) && nmr->nr_arg1 != 12) { error = EINVAL; break; } NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0); if (na && !error) { vpna = (struct netmap_vp_adapter *)na; na->virt_hdr_len = nmr->nr_arg1; if (na->virt_hdr_len) { vpna->mfs = NETMAP_BUF_SIZE(na); } D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na); netmap_adapter_put(na); } else if (!na) { error = ENXIO; } NMG_UNLOCK(); break; case NETMAP_BDG_POLLING_ON: case NETMAP_BDG_POLLING_OFF: NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0); if (na && !error) { if (!nm_is_bwrap(na)) { error = EOPNOTSUPP; } else if (cmd == NETMAP_BDG_POLLING_ON) { error = nm_bdg_ctl_polling_start(nmr, na); if (!error) netmap_adapter_get(na); } else { error = nm_bdg_ctl_polling_stop(nmr, na); if (!error) netmap_adapter_put(na); } netmap_adapter_put(na); } NMG_UNLOCK(); break; default: D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); error = EINVAL; break; } return error; } int netmap_bdg_config(struct nmreq *nmr) { struct nm_bridge *b; int error = EINVAL; NMG_LOCK(); b = nm_find_bridge(nmr->nr_name, 0); if (!b) { NMG_UNLOCK(); return error; } NMG_UNLOCK(); /* Don't call config() with NMG_LOCK() held */ BDG_RLOCK(b); if (b->bdg_ops.config != NULL) error = b->bdg_ops.config((struct nm_ifreq *)nmr); BDG_RUNLOCK(b); return error; } /* nm_krings_create callback for VALE ports. * Calls the standard netmap_krings_create, then adds leases on rx * rings and bdgfwd on tx rings. */ static int netmap_vp_krings_create(struct netmap_adapter *na) { u_int tailroom; int error, i; uint32_t *leases; u_int nrx = netmap_real_rings(na, NR_RX); /* * Leases are attached to RX rings on vale ports */ tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; error = netmap_krings_create(na, tailroom); if (error) return error; leases = na->tailroom; for (i = 0; i < nrx; i++) { /* Receive rings */ na->rx_rings[i].nkr_leases = leases; leases += na->num_rx_desc; } error = nm_alloc_bdgfwd(na); if (error) { netmap_krings_delete(na); return error; } return 0; } /* nm_krings_delete callback for VALE ports. */ static void netmap_vp_krings_delete(struct netmap_adapter *na) { nm_free_bdgfwd(na); netmap_krings_delete(na); } static int nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, u_int ring_nr); /* * main dispatch routine for the bridge. * Grab packets from a kring, move them into the ft structure * associated to the tx (input) port. Max one instance per port, * filtered on input (ioctl, poll or XXX). 
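 *
 * In rough pseudo-code (a reading aid only, details are in the code
 * below):
 *
 *	for (j = nr_hwcur; j != end; j = nm_next(j, lim)) {
 *		ft[ft_i++] = buf/len/flags of ring->slot[j];
 *		if (last fragment of a packet && ft_i >= bridge_batch)
 *			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
 *	}
 *	if (ft_i)
 *		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
 *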
* Returns the next position in the ring. */ static int nm_bdg_preflush(struct netmap_kring *kring, u_int end) { struct netmap_vp_adapter *na = (struct netmap_vp_adapter*)kring->na; struct netmap_ring *ring = kring->ring; struct nm_bdg_fwd *ft; u_int ring_nr = kring->ring_id; u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; u_int ft_i = 0; /* start from 0 */ u_int frags = 1; /* how many frags ? */ struct nm_bridge *b = na->na_bdg; /* To protect against modifications to the bridge we acquire a * shared lock, waiting if we can sleep (if the source port is * attached to a user process) or with a trylock otherwise (NICs). */ ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); if (na->up.na_flags & NAF_BDG_MAYSLEEP) BDG_RLOCK(b); else if (!BDG_RTRYLOCK(b)) return 0; ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); ft = kring->nkr_ft; for (; likely(j != end); j = nm_next(j, lim)) { struct netmap_slot *slot = &ring->slot[j]; char *buf; ft[ft_i].ft_len = slot->len; ft[ft_i].ft_flags = slot->flags; ND("flags is 0x%x", slot->flags); /* we do not use the buf changed flag, but we still need to reset it */ slot->flags &= ~NS_BUF_CHANGED; /* this slot goes into a list so initialize the link field */ ft[ft_i].ft_next = NM_FT_NULL; buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot); if (unlikely(buf == NULL)) { RD(5, "NULL %s buffer pointer from %s slot %d len %d", (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", kring->name, j, ft[ft_i].ft_len); buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); ft[ft_i].ft_len = 0; ft[ft_i].ft_flags = 0; } __builtin_prefetch(buf); ++ft_i; if (slot->flags & NS_MOREFRAG) { frags++; continue; } if (unlikely(netmap_verbose && frags > 1)) RD(5, "%d frags at %d", frags, ft_i - frags); ft[ft_i - frags].ft_frags = frags; frags = 1; if (unlikely((int)ft_i >= bridge_batch)) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); } if (frags > 1) { /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we * have to fix frags count. */ frags--; ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; ft[ft_i - frags].ft_frags = frags; D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags); } if (ft_i) ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); BDG_RUNLOCK(b); return j; } /* ----- FreeBSD if_bridge hash function ------- */ /* * The following hash function is adapted from "Hash Functions" by Bob Jenkins * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 
* * http://www.burtleburtle.net/bob/hash/spooky.html */ #define mix(a, b, c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (/*CONSTCOND*/0) static __inline uint32_t nm_bridge_rthash(const uint8_t *addr) { uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key b += addr[5] << 8; b += addr[4]; a += addr[3] << 24; a += addr[2] << 16; a += addr[1] << 8; a += addr[0]; mix(a, b, c); #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) return (c & BRIDGE_RTHASH_MASK); } #undef mix /* nm_register callback for VALE ports */ static int netmap_vp_reg(struct netmap_adapter *na, int onoff) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; enum txrx t; int i; /* persistent ports may be put in netmap mode * before being attached to a bridge */ if (vpna->na_bdg) BDG_WLOCK(vpna->na_bdg); if (onoff) { for_rx_tx(t) { for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; if (nm_kring_pending_on(kring)) kring->nr_mode = NKR_NETMAP_ON; } } if (na->active_fds == 0) na->na_flags |= NAF_NETMAP_ON; /* XXX on FreeBSD, persistent VALE ports should also * toggle IFCAP_NETMAP in na->ifp (2014-03-16) */ } else { if (na->active_fds == 0) na->na_flags &= ~NAF_NETMAP_ON; for_rx_tx(t) { for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { struct netmap_kring *kring = &NMR(na, t)[i]; if (nm_kring_pending_off(kring)) kring->nr_mode = NKR_NETMAP_OFF; } } } if (vpna->na_bdg) BDG_WUNLOCK(vpna->na_bdg); return 0; } /* * Lookup function for a learning bridge. * Update the hash table with the source address, * and then returns the destination port index, and the * ring in *dst_ring (at the moment, always use ring 0) */ u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, struct netmap_vp_adapter *na) { uint8_t *buf = ft->ft_buf; u_int buf_len = ft->ft_len; struct nm_hash_ent *ht = na->na_bdg->ht; uint32_t sh, dh; u_int dst, mysrc = na->bdg_port; uint64_t smac, dmac; uint8_t indbuf[12]; /* safety check, unfortunately we have many cases */ if (buf_len >= 14 + na->up.virt_hdr_len) { /* virthdr + mac_hdr in the same slot */ buf += na->up.virt_hdr_len; buf_len -= na->up.virt_hdr_len; } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { /* only header in first fragment */ ft++; buf = ft->ft_buf; buf_len = ft->ft_len; } else { RD(5, "invalid buf format, length %d", buf_len); return NM_BDG_NOPORT; } if (ft->ft_flags & NS_INDIRECT) { if (copyin(buf, indbuf, sizeof(indbuf))) { return NM_BDG_NOPORT; } buf = indbuf; } dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; smac = le64toh(*(uint64_t *)(buf + 4)); smac >>= 16; /* * The hash is somewhat expensive, there might be some * worthwhile optimizations here. */ if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */ uint8_t *s = buf+6; sh = nm_bridge_rthash(s); // XXX hash of source /* update source port forwarding entry */ na->last_smac = ht[sh].mac = smac; /* XXX expire ? */ ht[sh].ports = mysrc; if (netmap_verbose) D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", s[0], s[1], s[2], s[3], s[4], s[5], mysrc); } dst = NM_BDG_BROADCAST; if ((buf[0] & 1) == 0) { /* unicast */ dh = nm_bridge_rthash(buf); // XXX hash of dst if (ht[dh].mac == dmac) { /* found dst */ dst = ht[dh].ports; } /* XXX otherwise return NM_BDG_UNKNOWN ? 
*/ } return dst; } /* * Available space in the ring. Only used in VALE code * and only with is_rx = 1 */ static inline uint32_t nm_kr_space(struct netmap_kring *k, int is_rx) { int space; if (is_rx) { int busy = k->nkr_hwlease - k->nr_hwcur; if (busy < 0) busy += k->nkr_num_slots; space = k->nkr_num_slots - 1 - busy; } else { /* XXX never used in this branch */ space = k->nr_hwtail - k->nkr_hwlease; if (space < 0) space += k->nkr_num_slots; } #if 0 // sanity check if (k->nkr_hwlease >= k->nkr_num_slots || k->nr_hwcur >= k->nkr_num_slots || k->nr_tail >= k->nkr_num_slots || busy < 0 || busy >= k->nkr_num_slots) { D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, k->nkr_lease_idx, k->nkr_num_slots); } #endif return space; } /* make a lease on the kring for N positions. return the * lease index * XXX only used in VALE code and with is_rx = 1 */ static inline uint32_t nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) { uint32_t lim = k->nkr_num_slots - 1; uint32_t lease_idx = k->nkr_lease_idx; k->nkr_leases[lease_idx] = NR_NOSLOT; k->nkr_lease_idx = nm_next(lease_idx, lim); if (n > nm_kr_space(k, is_rx)) { D("invalid request for %d slots", n); panic("x"); } /* XXX verify that there are n slots */ k->nkr_hwlease += n; if (k->nkr_hwlease > lim) k->nkr_hwlease -= lim + 1; if (k->nkr_hwlease >= k->nkr_num_slots || k->nr_hwcur >= k->nkr_num_slots || k->nr_hwtail >= k->nkr_num_slots || k->nkr_lease_idx >= k->nkr_num_slots) { D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", k->na->name, k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, k->nkr_lease_idx, k->nkr_num_slots); } return lease_idx; } /* * * This flush routine supports only unicast and broadcast but a large * number of ports, and lets us replace the learn and dispatch functions. */ int nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, u_int ring_nr) { struct nm_bdg_q *dst_ents, *brddst; uint16_t num_dsts = 0, *dsts; struct nm_bridge *b = na->na_bdg; u_int i, me = na->bdg_port; /* * The work area (pointed by ft) is followed by an array of * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS * queues per port plus one for the broadcast traffic. * Then we have an array of destination indexes. */ dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); /* first pass: find a destination for each packet in the batch */ for (i = 0; likely(i < n); i += ft[i].ft_frags) { uint8_t dst_ring = ring_nr; /* default, same ring as origin */ uint16_t dst_port, d_i; struct nm_bdg_q *d; ND("slot %d frags %d", i, ft[i].ft_frags); /* Drop the packet if the virtio-net header is not into the first fragment nor at the very beginning of the second. 
*/ if (unlikely(na->up.virt_hdr_len > ft[i].ft_len)) continue; dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na); if (netmap_verbose > 255) RD(5, "slot %d port %d -> %d", i, me, dst_port); if (dst_port == NM_BDG_NOPORT) continue; /* this packet is identified to be dropped */ else if (unlikely(dst_port > NM_BDG_MAXPORTS)) continue; else if (dst_port == NM_BDG_BROADCAST) dst_ring = 0; /* broadcasts always go to ring 0 */ else if (unlikely(dst_port == me || !b->bdg_ports[dst_port])) continue; /* get a position in the scratch pad */ d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; d = dst_ents + d_i; /* append the first fragment to the list */ if (d->bq_head == NM_FT_NULL) { /* new destination */ d->bq_head = d->bq_tail = i; /* remember this position to be scanned later */ if (dst_port != NM_BDG_BROADCAST) dsts[num_dsts++] = d_i; } else { ft[d->bq_tail].ft_next = i; d->bq_tail = i; } d->bq_len += ft[i].ft_frags; } /* * Broadcast traffic goes to ring 0 on all destinations. * So we need to add these rings to the list of ports to scan. * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is * expensive. We should keep a compact list of active destinations * so we could shorten this loop. */ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; if (brddst->bq_head != NM_FT_NULL) { u_int j; for (j = 0; likely(j < b->bdg_active_ports); j++) { uint16_t d_i; i = b->bdg_port_index[j]; if (unlikely(i == me)) continue; d_i = i * NM_BDG_MAXRINGS; if (dst_ents[d_i].bq_head == NM_FT_NULL) dsts[num_dsts++] = d_i; } } ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); /* second pass: scan destinations */ for (i = 0; i < num_dsts; i++) { struct netmap_vp_adapter *dst_na; struct netmap_kring *kring; struct netmap_ring *ring; u_int dst_nr, lim, j, d_i, next, brd_next; u_int needed, howmany; int retry = netmap_txsync_retry; struct nm_bdg_q *d; uint32_t my_start = 0, lease_idx = 0; int nrings; int virt_hdr_mismatch = 0; d_i = dsts[i]; ND("second pass %d port %d", i, d_i); d = dst_ents + d_i; // XXX fix the division dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; /* protect from the lookup function returning an inactive * destination port */ if (unlikely(dst_na == NULL)) goto cleanup; if (dst_na->up.na_flags & NAF_SW_ONLY) goto cleanup; /* * The interface may be in !netmap mode in two cases: * - when na is attached but not activated yet; * - when na is being deactivated but is still attached. */ if (unlikely(!nm_netmap_on(&dst_na->up))) { ND("not in netmap mode!"); goto cleanup; } /* there is at least one either unicast or broadcast packet */ brd_next = brddst->bq_head; next = d->bq_head; /* we need to reserve this many slots. If fewer are * available, some packets will be dropped. * Packets may have multiple fragments, so we may not use * there is a chance that we may not use all of the slots * we have claimed, so we will need to handle the leftover * ones when we regain the lock. */ needed = d->bq_len + brddst->bq_len; if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, dst_na->up.virt_hdr_len); /* There is a virtio-net header/offloadings mismatch between * source and destination. The slower mismatch datapath will * be used to cope with all the mismatches. */ virt_hdr_mismatch = 1; if (dst_na->mfs < na->mfs) { /* We may need to do segmentation offloadings, and so * we may need a number of destination slots greater * than the number of input slots ('needed'). 
* We look for the smallest integer 'x' which satisfies: * needed * na->mfs + x * H <= x * na->mfs * where 'H' is the length of the longest header that may * be replicated in the segmentation process (e.g. for * TCPv4 we must account for ethernet header, IP header * and TCPv4 header). */ needed = (needed * na->mfs) / (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); } } ND(5, "pass 2 dst %d is %x %s", i, d_i, is_vp ? "virtual" : "nic/host"); dst_nr = d_i & (NM_BDG_MAXRINGS-1); nrings = dst_na->up.num_rx_rings; if (dst_nr >= nrings) dst_nr = dst_nr % nrings; kring = &dst_na->up.rx_rings[dst_nr]; ring = kring->ring; lim = kring->nkr_num_slots - 1; retry: if (dst_na->retry && retry) { /* try to get some free slot from the previous run */ kring->nm_notify(kring, 0); /* actually useful only for bwraps, since there * the notify will trigger a txsync on the hwna. VALE ports * have dst_na->retry == 0 */ } /* reserve the buffers in the queue and an entry * to report completion, and drop lock. * XXX this might become a helper function. */ mtx_lock(&kring->q_lock); if (kring->nkr_stopped) { mtx_unlock(&kring->q_lock); goto cleanup; } my_start = j = kring->nkr_hwlease; howmany = nm_kr_space(kring, 1); if (needed < howmany) howmany = needed; lease_idx = nm_kr_lease(kring, howmany, 1); mtx_unlock(&kring->q_lock); /* only retry if we need more than available slots */ if (retry && needed <= howmany) retry = 0; /* copy to the destination queue */ while (howmany > 0) { struct netmap_slot *slot; struct nm_bdg_fwd *ft_p, *ft_end; u_int cnt; /* find the queue from which we pick next packet. * NM_FT_NULL is always higher than valid indexes * so we never dereference it if the other list * has packets (and if both are empty we never * get here). */ if (next < brd_next) { ft_p = ft + next; next = ft_p->ft_next; } else { /* insert broadcast */ ft_p = ft + brd_next; brd_next = ft_p->ft_next; } cnt = ft_p->ft_frags; // cnt > 0 if (unlikely(cnt > howmany)) break; /* no more space */ if (netmap_verbose && cnt > 1) RD(5, "rx %d frags to %d", cnt, j); ft_end = ft_p + cnt; if (unlikely(virt_hdr_mismatch)) { bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); } else { howmany -= cnt; do { char *dst, *src = ft_p->ft_buf; size_t copy_len = ft_p->ft_len, dst_len = copy_len; slot = &ring->slot[j]; dst = NMB(&dst_na->up, slot); ND("send [%d] %d(%d) bytes at %s:%d", i, (int)copy_len, (int)dst_len, NM_IFPNAME(dst_ifp), j); /* round to a multiple of 64 */ copy_len = (copy_len + 63) & ~63; if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || copy_len > NETMAP_BUF_SIZE(&na->up))) { RD(5, "invalid len %d, down to 64", (int)copy_len); copy_len = dst_len = 64; // XXX } if (ft_p->ft_flags & NS_INDIRECT) { if (copyin(src, dst, copy_len)) { // invalid user pointer, pretend len is 0 dst_len = 0; } } else { //memcpy(dst, src, copy_len); pkt_copy(src, dst, (int)copy_len); } slot->len = dst_len; slot->flags = (cnt << 8)| NS_MOREFRAG; j = nm_next(j, lim); needed--; ft_p++; } while (ft_p != ft_end); slot->flags = (cnt << 8); /* clear flag on last entry */ } /* are we done ? */ if (next == NM_FT_NULL && brd_next == NM_FT_NULL) break; } { /* current position */ uint32_t *p = kring->nkr_leases; /* shorthand */ uint32_t update_pos; int still_locked = 1; mtx_lock(&kring->q_lock); if (unlikely(howmany > 0)) { /* not used all bufs. If i am the last one * i can recover the slots, otherwise must * fill them with 0 to mark empty packets. 
*/ ND("leftover %d bufs", howmany); if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { /* yes i am the last one */ ND("roll back nkr_hwlease to %d", j); kring->nkr_hwlease = j; } else { while (howmany-- > 0) { ring->slot[j].len = 0; ring->slot[j].flags = 0; j = nm_next(j, lim); } } } p[lease_idx] = j; /* report I am done */ update_pos = kring->nr_hwtail; if (my_start == update_pos) { /* all slots before my_start have been reported, * so scan subsequent leases to see if other ranges * have been completed, and to a selwakeup or txsync. */ while (lease_idx != kring->nkr_lease_idx && p[lease_idx] != NR_NOSLOT) { j = p[lease_idx]; p[lease_idx] = NR_NOSLOT; lease_idx = nm_next(lease_idx, lim); } /* j is the new 'write' position. j != my_start * means there are new buffers to report */ if (likely(j != my_start)) { kring->nr_hwtail = j; still_locked = 0; mtx_unlock(&kring->q_lock); kring->nm_notify(kring, 0); /* this is netmap_notify for VALE ports and * netmap_bwrap_notify for bwrap. The latter will * trigger a txsync on the underlying hwna */ if (dst_na->retry && retry--) { /* XXX this is going to call nm_notify again. * Only useful for bwrap in virtual machines */ goto retry; } } } if (still_locked) mtx_unlock(&kring->q_lock); } cleanup: d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ d->bq_len = 0; } brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ brddst->bq_len = 0; return 0; } /* nm_txsync callback for VALE ports */ static int netmap_vp_txsync(struct netmap_kring *kring, int flags) { struct netmap_vp_adapter *na = (struct netmap_vp_adapter *)kring->na; u_int done; u_int const lim = kring->nkr_num_slots - 1; u_int const head = kring->rhead; if (bridge_batch <= 0) { /* testing only */ done = head; // used all goto done; } if (!na->na_bdg) { done = head; goto done; } if (bridge_batch > NM_BDG_BATCH) bridge_batch = NM_BDG_BATCH; done = nm_bdg_preflush(kring, head); done: if (done != head) D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail); /* * packets between 'done' and 'cur' are left unsent. */ kring->nr_hwcur = done; kring->nr_hwtail = nm_prev(done, lim); if (netmap_verbose) D("%s ring %d flags %d", na->up.name, kring->ring_id, flags); return 0; } /* rxsync code used by VALE ports nm_rxsync callback and also * internally by the brwap */ static int netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; u_int nm_i, lim = kring->nkr_num_slots - 1; u_int head = kring->rhead; int n; if (head > lim) { D("ouch dangerous reset!!!"); n = netmap_ring_reinit(kring); goto done; } /* First part, import newly received packets. */ /* actually nothing to do here, they are already in the kring */ /* Second part, skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; if (nm_i != head) { /* consistency check, but nothing really important here */ for (n = 0; likely(nm_i != head); n++) { struct netmap_slot *slot = &ring->slot[nm_i]; void *addr = NMB(na, slot); if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */ D("bad buffer index %d, ignore ?", slot->buf_idx); } slot->flags &= ~NS_BUF_CHANGED; nm_i = nm_next(nm_i, lim); } kring->nr_hwcur = head; } n = 0; done: return n; } /* * nm_rxsync callback for VALE ports * user process reading from a VALE switch. * Already protected against concurrent calls from userspace, * but we must acquire the queue's lock to protect against * writers on the same queue. 
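 * (The writers in question are senders running nm_bdg_flush(), which
 * deliver into this rx ring under the same q_lock, using the lease
 * mechanism above.)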
*/ static int netmap_vp_rxsync(struct netmap_kring *kring, int flags) { int n; mtx_lock(&kring->q_lock); n = netmap_vp_rxsync_locked(kring, flags); mtx_unlock(&kring->q_lock); return n; } /* nm_bdg_attach callback for VALE ports * The na_vp port is this same netmap_adapter. There is no host port. */ static int netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; if (vpna->na_bdg) return EBUSY; na->na_vp = vpna; strncpy(na->name, name, sizeof(na->name)); na->na_hostvp = NULL; return 0; } /* create a netmap_vp_adapter that describes a VALE port. * Only persistent VALE ports have a non-null ifp. */ static int netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret) { struct netmap_vp_adapter *vpna; struct netmap_adapter *na; int error; u_int npipes = 0; vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); if (vpna == NULL) return ENOMEM; na = &vpna->up; na->ifp = ifp; strncpy(na->name, nmr->nr_name, sizeof(na->name)); /* bound checking */ na->num_tx_rings = nmr->nr_tx_rings; nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); nmr->nr_tx_rings = na->num_tx_rings; // write back na->num_rx_rings = nmr->nr_rx_rings; nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); nmr->nr_rx_rings = na->num_rx_rings; // write back nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1, NM_BDG_MAXSLOTS, NULL); na->num_tx_desc = nmr->nr_tx_slots; nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1, NM_BDG_MAXSLOTS, NULL); /* validate number of pipes. We want at least 1, * but probably can do with some more. * So let's use 2 as default (when 0 is supplied) */ npipes = nmr->nr_arg1; nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); nmr->nr_arg1 = npipes; /* write back */ /* validate extra bufs */ nm_bound_var(&nmr->nr_arg3, 0, 0, 128*NM_BDG_MAXSLOTS, NULL); na->num_rx_desc = nmr->nr_rx_slots; vpna->mfs = 1514; vpna->last_smac = ~0llu; /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? vpna->mfs = netmap_buf_size; */ if (netmap_verbose) D("max frame size %u", vpna->mfs); na->na_flags |= NAF_BDG_MAYSLEEP; /* persistent VALE ports look like hw devices * with a native netmap adapter */ if (ifp) na->na_flags |= NAF_NATIVE; na->nm_txsync = netmap_vp_txsync; na->nm_rxsync = netmap_vp_rxsync; na->nm_register = netmap_vp_reg; na->nm_krings_create = netmap_vp_krings_create; na->nm_krings_delete = netmap_vp_krings_delete; na->nm_dtor = netmap_vp_dtor; na->nm_mem = netmap_mem_private_new(na->name, na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc, nmr->nr_arg3, npipes, &error); if (na->nm_mem == NULL) goto err; na->nm_bdg_attach = netmap_vp_bdg_attach; /* other nmd fields are set in the common routine */ error = netmap_attach_common(na); if (error) goto err; *ret = vpna; return 0; err: if (na->nm_mem != NULL) netmap_mem_delete(na->nm_mem); free(vpna, M_DEVBUF); return error; } /* Bridge wrapper code (bwrap). * This is used to connect a non-VALE-port netmap_adapter (hwna) to a * VALE switch. * The main task is to swap the meaning of tx and rx rings to match the * expectations of the VALE switch code (see nm_bdg_flush). * * The bwrap works by interposing a netmap_bwrap_adapter between the * rest of the system and the hwna. The netmap_bwrap_adapter looks like * a netmap_vp_adapter to the rest the system, but, internally, it * translates all callbacks to what the hwna expects. 
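 *
 * Schematically:
 *
 *	VALE switch port  <-->  netmap_bwrap_adapter  <-->  hwna (NIC)
 *	                       (tx and rx rings swapped)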
* * Note that we have to intercept callbacks coming from two sides: * * - callbacks coming from the netmap module are intercepted by * passing around the netmap_bwrap_adapter instead of the hwna * * - callbacks coming from outside of the netmap module only know * about the hwna. This, however, only happens in interrupt * handlers, where only the hwna->nm_notify callback is called. * What the bwrap does is to overwrite the hwna->nm_notify callback * with its own netmap_bwrap_intr_notify. * XXX This assumes that the hwna->nm_notify callback was the * standard netmap_notify(), as it is the case for nic adapters. * Any additional action performed by hwna->nm_notify will not be * performed by netmap_bwrap_intr_notify. * * Additionally, the bwrap can optionally attach the host rings pair * of the wrapped adapter to a different port of the switch. */ static void netmap_bwrap_dtor(struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; struct netmap_adapter *hwna = bna->hwna; struct nm_bridge *b = bna->up.na_bdg, *bh = bna->host.na_bdg; if (b) { netmap_bdg_detach_common(b, bna->up.bdg_port, (bh ? bna->host.bdg_port : -1)); } ND("na %p", na); na->ifp = NULL; bna->host.up.ifp = NULL; hwna->na_private = NULL; hwna->na_vp = hwna->na_hostvp = NULL; hwna->na_flags &= ~NAF_BUSY; netmap_adapter_put(hwna); } /* * Intr callback for NICs connected to a bridge. * Simply ignore tx interrupts (maybe we could try to recover space ?) * and pass received packets from nic to the bridge. * * XXX TODO check locking: this is called from the interrupt * handler so we should make sure that the interface is not * disconnected while passing down an interrupt. * * Note, no user process can access this NIC or the host stack. * The only part of the ring that is significant are the slots, * and head/cur/tail are set from the kring as needed * (part as a receive ring, part as a transmit ring). * * callback that overwrites the hwna notify callback. * Packets come from the outside or from the host stack and are put on an * hwna rx ring. * The bridge wrapper then sends the packets through the bridge. */ static int netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_bwrap_adapter *bna = na->na_private; struct netmap_kring *bkring; struct netmap_vp_adapter *vpna = &bna->up; u_int ring_nr = kring->ring_id; int ret = NM_IRQ_COMPLETED; int error; if (netmap_verbose) D("%s %s 0x%x", na->name, kring->name, flags); bkring = &vpna->up.tx_rings[ring_nr]; /* make sure the ring is not disabled */ if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) { return EIO; } if (netmap_verbose) D("%s head %d cur %d tail %d", na->name, kring->rhead, kring->rcur, kring->rtail); /* simulate a user wakeup on the rx ring * fetch packets that have arrived. */ error = kring->nm_sync(kring, 0); if (error) goto put_out; if (kring->nr_hwcur == kring->nr_hwtail) { if (netmap_verbose) D("how strange, interrupt with no packets on %s", na->name); goto put_out; } /* new packets are kring->rcur to kring->nr_hwtail, and the bkring * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail * to push all packets out. */ bkring->rhead = bkring->rcur = kring->nr_hwtail; netmap_vp_txsync(bkring, flags); /* mark all buffers as released on this ring */ kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail; /* another call to actually release the buffers */ error = kring->nm_sync(kring, 0); /* The second rxsync may have further advanced hwtail. 
If this happens, * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */ if (kring->rcur != kring->nr_hwtail) { ret = NM_IRQ_RESCHED; } put_out: nm_kr_put(kring); return error ? error : ret; } /* nm_register callback for bwrap */ static int netmap_bwrap_reg(struct netmap_adapter *na, int onoff) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; struct netmap_vp_adapter *hostna = &bna->host; int error, i; enum txrx t; ND("%s %s", na->name, onoff ? "on" : "off"); if (onoff) { /* netmap_do_regif has been called on the bwrap na. * We need to pass the information about the * memory allocator down to the hwna before * putting it in netmap mode */ hwna->na_lut = na->na_lut; if (hostna->na_bdg) { /* if the host rings have been attached to switch, * we need to copy the memory allocator information * in the hostna also */ hostna->up.na_lut = na->na_lut; } /* cross-link the netmap rings * The original number of rings comes from hwna, * rx rings on one side equals tx rings on the other. */ for_rx_tx(t) { enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { NMR(hwna, r)[i].ring = NMR(na, t)[i].ring; } } if (na->na_flags & NAF_HOST_RINGS) { struct netmap_adapter *hna = &hostna->up; /* the hostna rings are the host rings of the bwrap. * The corresponding krings must point back to the * hostna */ hna->tx_rings = &na->tx_rings[na->num_tx_rings]; hna->tx_rings[0].na = hna; hna->rx_rings = &na->rx_rings[na->num_rx_rings]; hna->rx_rings[0].na = hna; } } /* pass down the pending ring state information */ for_rx_tx(t) { for (i = 0; i < nma_get_nrings(na, t) + 1; i++) NMR(hwna, t)[i].nr_pending_mode = NMR(na, t)[i].nr_pending_mode; } /* forward the request to the hwna */ error = hwna->nm_register(hwna, onoff); if (error) return error; /* copy up the current ring state information */ for_rx_tx(t) { for (i = 0; i < nma_get_nrings(na, t) + 1; i++) NMR(na, t)[i].nr_mode = NMR(hwna, t)[i].nr_mode; } /* impersonate a netmap_vp_adapter */ netmap_vp_reg(na, onoff); if (hostna->na_bdg) netmap_vp_reg(&hostna->up, onoff); if (onoff) { u_int i; /* intercept the hwna nm_nofify callback on the hw rings */ for (i = 0; i < hwna->num_rx_rings; i++) { hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify; hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; } i = hwna->num_rx_rings; /* for safety */ /* save the host ring notify unconditionally */ hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify; if (hostna->na_bdg) { /* also intercept the host ring notify */ hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; } if (na->active_fds == 0) na->na_flags |= NAF_NETMAP_ON; } else { u_int i; if (na->active_fds == 0) na->na_flags &= ~NAF_NETMAP_ON; /* reset all notify callbacks (including host ring) */ for (i = 0; i <= hwna->num_rx_rings; i++) { hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify; hwna->rx_rings[i].save_notify = NULL; } hwna->na_lut.lut = NULL; hwna->na_lut.objtotal = 0; hwna->na_lut.objsize = 0; } return 0; } /* nm_config callback for bwrap */ static int netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; /* forward the request */ netmap_update_config(hwna); /* swap the results */ *txr = hwna->num_rx_rings; *txd = hwna->num_rx_desc; *rxr = hwna->num_tx_rings; *rxd = hwna->num_rx_desc; return 0; } /* 
nm_krings_create callback for bwrap */ static int netmap_bwrap_krings_create(struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; int i, error = 0; enum txrx t; ND("%s", na->name); /* impersonate a netmap_vp_adapter */ error = netmap_vp_krings_create(na); if (error) return error; /* also create the hwna krings */ error = hwna->nm_krings_create(hwna); if (error) { goto err_del_vp_rings; } /* get each ring slot number from the corresponding hwna ring */ for_rx_tx(t) { enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots; } } return 0; err_del_vp_rings: netmap_vp_krings_delete(na); return error; } static void netmap_bwrap_krings_delete(struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; ND("%s", na->name); hwna->nm_krings_delete(hwna); netmap_vp_krings_delete(na); } /* notify method for the bridge-->hwna direction */ static int netmap_bwrap_notify(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_bwrap_adapter *bna = na->na_private; struct netmap_adapter *hwna = bna->hwna; u_int ring_n = kring->ring_id; u_int lim = kring->nkr_num_slots - 1; struct netmap_kring *hw_kring; int error; ND("%s: na %s hwna %s", (kring ? kring->name : "NULL!"), (na ? na->name : "NULL!"), (hwna ? hwna->name : "NULL!")); hw_kring = &hwna->tx_rings[ring_n]; if (nm_kr_tryget(hw_kring, 0, NULL)) { return ENXIO; } /* first step: simulate a user wakeup on the rx ring */ netmap_vp_rxsync(kring, flags); ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", na->name, ring_n, kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); /* second step: the new packets are sent on the tx ring * (which is actually the same ring) */ hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail; error = hw_kring->nm_sync(hw_kring, flags); if (error) goto put_out; /* third step: now we are back the rx ring */ /* claim ownership on all hw owned bufs */ kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */ /* fourth step: the user goes to sleep again, causing another rxsync */ netmap_vp_rxsync(kring, flags); ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", na->name, ring_n, kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); put_out: nm_kr_put(hw_kring); return error ? error : NM_IRQ_COMPLETED; } /* nm_bdg_ctl callback for the bwrap. * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd]. * On attach, it needs to provide a fake netmap_priv_d structure and * perform a netmap_do_regif() on the bwrap. This will put both the * bwrap and the hwna in netmap mode, with the netmap rings shared * and cross linked. Moroever, it will start intercepting interrupts * directed to hwna. 
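Note on the cross-linking above: netmap_bwrap_krings_create() sizes each bwrap ring from the hwna ring of the opposite direction (via nm_txrx_swap), because a frame the NIC receives is transmitted by the bridge port and vice versa. A small standalone sketch of that index swap follows; the enum and names are illustrative, not the kernel's NR_TX/NR_RX or NMR macros.

#include <stdio.h>

/* Illustrative direction enum; netmap's own NR_TX/NR_RX live in the kernel headers. */
enum dir { DIR_TX = 0, DIR_RX = 1 };

/* Swap TX <-> RX, as nm_txrx_swap() does for the bwrap cross-linking. */
static enum dir dir_swap(enum dir d)
{
	return (d == DIR_TX) ? DIR_RX : DIR_TX;
}

int main(void)
{
	/* Ring counts of a hypothetical wrapped NIC: 4 TX rings, 2 RX rings. */
	unsigned hw_rings[2] = { 4, 2 };
	unsigned bwrap_rings[2];

	for (enum dir d = DIR_TX; d <= DIR_RX; d++)
		bwrap_rings[d] = hw_rings[dir_swap(d)];

	printf("bwrap tx rings = %u (taken from hw rx)\n", bwrap_rings[DIR_TX]);
	printf("bwrap rx rings = %u (taken from hw tx)\n", bwrap_rings[DIR_RX]);
	return 0;
}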
*/ static int netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) { struct netmap_priv_d *npriv; struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; int error = 0; if (attach) { if (NETMAP_OWNED_BY_ANY(na)) { return EBUSY; } if (bna->na_kpriv) { /* nothing to do */ return 0; } npriv = netmap_priv_new(); if (npriv == NULL) return ENOMEM; npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */ error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW); if (error) { netmap_priv_delete(npriv); return error; } bna->na_kpriv = npriv; na->na_flags |= NAF_BUSY; } else { if (na->active_fds == 0) /* not registered */ return EINVAL; netmap_priv_delete(bna->na_kpriv); bna->na_kpriv = NULL; na->na_flags &= ~NAF_BUSY; } return error; } /* attach a bridge wrapper to the 'real' device */ int netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) { struct netmap_bwrap_adapter *bna; struct netmap_adapter *na = NULL; struct netmap_adapter *hostna = NULL; int error = 0; enum txrx t; /* make sure the NIC is not already in use */ if (NETMAP_OWNED_BY_ANY(hwna)) { D("NIC %s busy, cannot attach to bridge", hwna->name); return EBUSY; } bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO); if (bna == NULL) { return ENOMEM; } na = &bna->up.up; /* make bwrap ifp point to the real ifp */ na->ifp = hwna->ifp; na->na_private = bna; strncpy(na->name, nr_name, sizeof(na->name)); /* fill the ring data for the bwrap adapter with rx/tx meanings * swapped. The real cross-linking will be done during register, * when all the krings will have been created. */ for_rx_tx(t) { enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ nma_set_nrings(na, t, nma_get_nrings(hwna, r)); nma_set_ndesc(na, t, nma_get_ndesc(hwna, r)); } na->nm_dtor = netmap_bwrap_dtor; na->nm_register = netmap_bwrap_reg; // na->nm_txsync = netmap_bwrap_txsync; // na->nm_rxsync = netmap_bwrap_rxsync; na->nm_config = netmap_bwrap_config; na->nm_krings_create = netmap_bwrap_krings_create; na->nm_krings_delete = netmap_bwrap_krings_delete; na->nm_notify = netmap_bwrap_notify; na->nm_bdg_ctl = netmap_bwrap_bdg_ctl; na->pdev = hwna->pdev; na->nm_mem = hwna->nm_mem; na->virt_hdr_len = hwna->virt_hdr_len; bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ bna->hwna = hwna; netmap_adapter_get(hwna); hwna->na_private = bna; /* weak reference */ hwna->na_vp = &bna->up; if (hwna->na_flags & NAF_HOST_RINGS) { if (hwna->na_flags & NAF_SW_ONLY) na->na_flags |= NAF_SW_ONLY; na->na_flags |= NAF_HOST_RINGS; hostna = &bna->host.up; snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name); hostna->ifp = hwna->ifp; for_rx_tx(t) { enum txrx r = nm_txrx_swap(t); nma_set_nrings(hostna, t, 1); nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r)); } // hostna->nm_txsync = netmap_bwrap_host_txsync; // hostna->nm_rxsync = netmap_bwrap_host_rxsync; hostna->nm_notify = netmap_bwrap_notify; hostna->nm_mem = na->nm_mem; hostna->na_private = bna; hostna->na_vp = &bna->up; na->na_hostvp = hwna->na_hostvp = hostna->na_hostvp = &bna->host; hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */ } ND("%s<->%s txr %d txd %d rxr %d rxd %d", na->name, ifp->if_xname, na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc); error = netmap_attach_common(na); if (error) { goto err_free; } hwna->na_flags |= NAF_BUSY; return 0; err_free: hwna->na_vp = hwna->na_hostvp = NULL; netmap_adapter_put(hwna); free(bna, M_DEVBUF); return error; } struct nm_bridge * netmap_init_bridges2(u_int n) { int i; struct 
nm_bridge *b; b = malloc(sizeof(struct nm_bridge) * n, M_DEVBUF, M_NOWAIT | M_ZERO); if (b == NULL) return NULL; for (i = 0; i < n; i++) BDG_RWINIT(&b[i]); return b; } void netmap_uninit_bridges2(struct nm_bridge *b, u_int n) { int i; if (b == NULL) return; for (i = 0; i < n; i++) BDG_RWDESTROY(&b[i]); free(b, M_DEVBUF); } int netmap_init_bridges(void) { #ifdef CONFIG_NET_NS return netmap_bns_register(); #else nm_bridges = netmap_init_bridges2(NM_BRIDGES); if (nm_bridges == NULL) return ENOMEM; return 0; #endif } void netmap_uninit_bridges(void) { #ifdef CONFIG_NET_NS netmap_bns_unregister(); #else netmap_uninit_bridges2(nm_bridges, NM_BRIDGES); #endif } #endif /* WITH_VALE */ Index: user/alc/PQ_LAUNDRY/sys/dev/re/if_re.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/re/if_re.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/dev/re/if_re.c (revision 308054) @@ -1,4076 +1,4078 @@ /*- * Copyright (c) 1997, 1998-2003 * Bill Paul . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Bill Paul. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY Bill Paul AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * RealTek 8139C+/8169/8169S/8110S/8168/8111/8101E PCI NIC driver * * Written by Bill Paul * Senior Networking Software Engineer * Wind River Systems */ /* * This driver is designed to support RealTek's next generation of * 10/100 and 10/100/1000 PCI ethernet controllers. There are currently * seven devices in this family: the RTL8139C+, the RTL8169, the RTL8169S, * RTL8110S, the RTL8168, the RTL8111 and the RTL8101E. * * The 8139C+ is a 10/100 ethernet chip. It is backwards compatible * with the older 8139 family, however it also supports a special * C+ mode of operation that provides several new performance enhancing * features. These include: * * o Descriptor based DMA mechanism. Each descriptor represents * a single packet fragment. Data buffers may be aligned on * any byte boundary. 
* * o 64-bit DMA * * o TCP/IP checksum offload for both RX and TX * * o High and normal priority transmit DMA rings * * o VLAN tag insertion and extraction * * o TCP large send (segmentation offload) * * Like the 8139, the 8139C+ also has a built-in 10/100 PHY. The C+ * programming API is fairly straightforward. The RX filtering, EEPROM * access and PHY access is the same as it is on the older 8139 series * chips. * * The 8169 is a 64-bit 10/100/1000 gigabit ethernet MAC. It has almost the * same programming API and feature set as the 8139C+ with the following * differences and additions: * * o 1000Mbps mode * * o Jumbo frames * * o GMII and TBI ports/registers for interfacing with copper * or fiber PHYs * * o RX and TX DMA rings can have up to 1024 descriptors * (the 8139C+ allows a maximum of 64) * * o Slight differences in register layout from the 8139C+ * * The TX start and timer interrupt registers are at different locations * on the 8169 than they are on the 8139C+. Also, the status word in the * RX descriptor has a slightly different bit layout. The 8169 does not * have a built-in PHY. Most reference boards use a Marvell 88E1000 'Alaska' * copper gigE PHY. * * The 8169S/8110S 10/100/1000 devices have built-in copper gigE PHYs * (the 'S' stands for 'single-chip'). These devices have the same * programming API as the older 8169, but also have some vendor-specific * registers for the on-board PHY. The 8110S is a LAN-on-motherboard * part designed to be pin-compatible with the RealTek 8100 10/100 chip. * * This driver takes advantage of the RX and TX checksum offload and * VLAN tag insertion/extraction features. It also implements TX * interrupt moderation using the timer interrupt registers, which * significantly reduces TX interrupt load. There is also support * for jumbo frames, however the 8169/8169S/8110S can not transmit * jumbo frames larger than 7440, so the max MTU possible with this * driver is 7422 bytes. */ #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_DEPEND(re, pci, 1, 1, 1); MODULE_DEPEND(re, ether, 1, 1, 1); MODULE_DEPEND(re, miibus, 1, 1, 1); /* "device miibus" required. See GENERIC if you get errors here. */ #include "miibus_if.h" /* Tunables. */ static int intr_filter = 0; TUNABLE_INT("hw.re.intr_filter", &intr_filter); static int msi_disable = 0; TUNABLE_INT("hw.re.msi_disable", &msi_disable); static int msix_disable = 0; TUNABLE_INT("hw.re.msix_disable", &msix_disable); static int prefer_iomap = 0; TUNABLE_INT("hw.re.prefer_iomap", &prefer_iomap); #define RE_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP) /* * Various supported device vendors/types and their names. 
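The driver description above notes that the 8169/8169S/8110S cannot transmit jumbo frames larger than 7440 bytes, giving a maximum MTU of 7422. The short worked check below makes the arithmetic explicit, using the standard 14-byte Ethernet header and 4-byte CRC; it is only an illustration of the comment, not driver code.

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Largest frame the 8169/8169S/8110S will transmit (per the comment above). */
	const int max_frame = 7440;
	const int ether_hdr = 14;	/* destination + source + type */
	const int ether_crc = 4;	/* frame check sequence */

	/* The MTU counts only the payload, so subtract header and CRC. */
	int mtu = max_frame - ether_hdr - ether_crc;
	assert(mtu == 7422);
	printf("max MTU = %d\n", mtu);
	return 0;
}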
*/ static const struct rl_type re_devs[] = { { DLINK_VENDORID, DLINK_DEVICEID_528T, 0, "D-Link DGE-528(T) Gigabit Ethernet Adapter" }, { DLINK_VENDORID, DLINK_DEVICEID_530T_REVC, 0, "D-Link DGE-530(T) Gigabit Ethernet Adapter" }, { RT_VENDORID, RT_DEVICEID_8139, 0, "RealTek 8139C+ 10/100BaseTX" }, { RT_VENDORID, RT_DEVICEID_8101E, 0, "RealTek 810xE PCIe 10/100baseTX" }, { RT_VENDORID, RT_DEVICEID_8168, 0, "RealTek 8168/8111 B/C/CP/D/DP/E/F/G PCIe Gigabit Ethernet" }, { NCUBE_VENDORID, RT_DEVICEID_8168, 0, "TP-Link TG-3468 v2 (RTL8168) Gigabit Ethernet" }, { RT_VENDORID, RT_DEVICEID_8169, 0, "RealTek 8169/8169S/8169SB(L)/8110S/8110SB(L) Gigabit Ethernet" }, { RT_VENDORID, RT_DEVICEID_8169SC, 0, "RealTek 8169SC/8110SC Single-chip Gigabit Ethernet" }, { COREGA_VENDORID, COREGA_DEVICEID_CGLAPCIGT, 0, "Corega CG-LAPCIGT (RTL8169S) Gigabit Ethernet" }, { LINKSYS_VENDORID, LINKSYS_DEVICEID_EG1032, 0, "Linksys EG1032 (RTL8169S) Gigabit Ethernet" }, { USR_VENDORID, USR_DEVICEID_997902, 0, "US Robotics 997902 (RTL8169S) Gigabit Ethernet" } }; static const struct rl_hwrev re_hwrevs[] = { { RL_HWREV_8139, RL_8139, "", RL_MTU }, { RL_HWREV_8139A, RL_8139, "A", RL_MTU }, { RL_HWREV_8139AG, RL_8139, "A-G", RL_MTU }, { RL_HWREV_8139B, RL_8139, "B", RL_MTU }, { RL_HWREV_8130, RL_8139, "8130", RL_MTU }, { RL_HWREV_8139C, RL_8139, "C", RL_MTU }, { RL_HWREV_8139D, RL_8139, "8139D/8100B/8100C", RL_MTU }, { RL_HWREV_8139CPLUS, RL_8139CPLUS, "C+", RL_MTU }, { RL_HWREV_8168B_SPIN1, RL_8169, "8168", RL_JUMBO_MTU }, { RL_HWREV_8169, RL_8169, "8169", RL_JUMBO_MTU }, { RL_HWREV_8169S, RL_8169, "8169S", RL_JUMBO_MTU }, { RL_HWREV_8110S, RL_8169, "8110S", RL_JUMBO_MTU }, { RL_HWREV_8169_8110SB, RL_8169, "8169SB/8110SB", RL_JUMBO_MTU }, { RL_HWREV_8169_8110SC, RL_8169, "8169SC/8110SC", RL_JUMBO_MTU }, { RL_HWREV_8169_8110SBL, RL_8169, "8169SBL/8110SBL", RL_JUMBO_MTU }, { RL_HWREV_8169_8110SCE, RL_8169, "8169SC/8110SC", RL_JUMBO_MTU }, { RL_HWREV_8100, RL_8139, "8100", RL_MTU }, { RL_HWREV_8101, RL_8139, "8101", RL_MTU }, { RL_HWREV_8100E, RL_8169, "8100E", RL_MTU }, { RL_HWREV_8101E, RL_8169, "8101E", RL_MTU }, { RL_HWREV_8102E, RL_8169, "8102E", RL_MTU }, { RL_HWREV_8102EL, RL_8169, "8102EL", RL_MTU }, { RL_HWREV_8102EL_SPIN1, RL_8169, "8102EL", RL_MTU }, { RL_HWREV_8103E, RL_8169, "8103E", RL_MTU }, { RL_HWREV_8401E, RL_8169, "8401E", RL_MTU }, { RL_HWREV_8402, RL_8169, "8402", RL_MTU }, { RL_HWREV_8105E, RL_8169, "8105E", RL_MTU }, { RL_HWREV_8105E_SPIN1, RL_8169, "8105E", RL_MTU }, { RL_HWREV_8106E, RL_8169, "8106E", RL_MTU }, { RL_HWREV_8168B_SPIN2, RL_8169, "8168", RL_JUMBO_MTU }, { RL_HWREV_8168B_SPIN3, RL_8169, "8168", RL_JUMBO_MTU }, { RL_HWREV_8168C, RL_8169, "8168C/8111C", RL_JUMBO_MTU_6K }, { RL_HWREV_8168C_SPIN2, RL_8169, "8168C/8111C", RL_JUMBO_MTU_6K }, { RL_HWREV_8168CP, RL_8169, "8168CP/8111CP", RL_JUMBO_MTU_6K }, { RL_HWREV_8168D, RL_8169, "8168D/8111D", RL_JUMBO_MTU_9K }, { RL_HWREV_8168DP, RL_8169, "8168DP/8111DP", RL_JUMBO_MTU_9K }, { RL_HWREV_8168E, RL_8169, "8168E/8111E", RL_JUMBO_MTU_9K}, { RL_HWREV_8168E_VL, RL_8169, "8168E/8111E-VL", RL_JUMBO_MTU_6K}, { RL_HWREV_8168EP, RL_8169, "8168EP/8111EP", RL_JUMBO_MTU_9K}, { RL_HWREV_8168F, RL_8169, "8168F/8111F", RL_JUMBO_MTU_9K}, { RL_HWREV_8168G, RL_8169, "8168G/8111G", RL_JUMBO_MTU_9K}, { RL_HWREV_8168GU, RL_8169, "8168GU/8111GU", RL_JUMBO_MTU_9K}, { RL_HWREV_8168H, RL_8169, "8168H/8111H", RL_JUMBO_MTU_9K}, { RL_HWREV_8411, RL_8169, "8411", RL_JUMBO_MTU_9K}, { RL_HWREV_8411B, RL_8169, "8411B", RL_JUMBO_MTU_9K}, { 0, 0, NULL, 0 } }; static int 
re_probe (device_t); static int re_attach (device_t); static int re_detach (device_t); static int re_encap (struct rl_softc *, struct mbuf **); static void re_dma_map_addr (void *, bus_dma_segment_t *, int, int); static int re_allocmem (device_t, struct rl_softc *); static __inline void re_discard_rxbuf (struct rl_softc *, int); static int re_newbuf (struct rl_softc *, int); static int re_jumbo_newbuf (struct rl_softc *, int); static int re_rx_list_init (struct rl_softc *); static int re_jrx_list_init (struct rl_softc *); static int re_tx_list_init (struct rl_softc *); #ifdef RE_FIXUP_RX static __inline void re_fixup_rx (struct mbuf *); #endif static int re_rxeof (struct rl_softc *, int *); static void re_txeof (struct rl_softc *); #ifdef DEVICE_POLLING static int re_poll (struct ifnet *, enum poll_cmd, int); static int re_poll_locked (struct ifnet *, enum poll_cmd, int); #endif static int re_intr (void *); static void re_intr_msi (void *); static void re_tick (void *); static void re_int_task (void *, int); static void re_start (struct ifnet *); static void re_start_locked (struct ifnet *); static int re_ioctl (struct ifnet *, u_long, caddr_t); static void re_init (void *); static void re_init_locked (struct rl_softc *); static void re_stop (struct rl_softc *); static void re_watchdog (struct rl_softc *); static int re_suspend (device_t); static int re_resume (device_t); static int re_shutdown (device_t); static int re_ifmedia_upd (struct ifnet *); static void re_ifmedia_sts (struct ifnet *, struct ifmediareq *); static void re_eeprom_putbyte (struct rl_softc *, int); static void re_eeprom_getword (struct rl_softc *, int, u_int16_t *); static void re_read_eeprom (struct rl_softc *, caddr_t, int, int); static int re_gmii_readreg (device_t, int, int); static int re_gmii_writereg (device_t, int, int, int); static int re_miibus_readreg (device_t, int, int); static int re_miibus_writereg (device_t, int, int, int); static void re_miibus_statchg (device_t); static void re_set_jumbo (struct rl_softc *, int); static void re_set_rxmode (struct rl_softc *); static void re_reset (struct rl_softc *); static void re_setwol (struct rl_softc *); static void re_clrwol (struct rl_softc *); static void re_set_linkspeed (struct rl_softc *); #ifdef DEV_NETMAP /* see ixgbe.c for details */ #include MODULE_DEPEND(re, netmap, 1, 1, 1); #endif /* !DEV_NETMAP */ #ifdef RE_DIAG static int re_diag (struct rl_softc *); #endif static void re_add_sysctls (struct rl_softc *); static int re_sysctl_stats (SYSCTL_HANDLER_ARGS); static int sysctl_int_range (SYSCTL_HANDLER_ARGS, int, int); static int sysctl_hw_re_int_mod (SYSCTL_HANDLER_ARGS); static device_method_t re_methods[] = { /* Device interface */ DEVMETHOD(device_probe, re_probe), DEVMETHOD(device_attach, re_attach), DEVMETHOD(device_detach, re_detach), DEVMETHOD(device_suspend, re_suspend), DEVMETHOD(device_resume, re_resume), DEVMETHOD(device_shutdown, re_shutdown), /* MII interface */ DEVMETHOD(miibus_readreg, re_miibus_readreg), DEVMETHOD(miibus_writereg, re_miibus_writereg), DEVMETHOD(miibus_statchg, re_miibus_statchg), DEVMETHOD_END }; static driver_t re_driver = { "re", re_methods, sizeof(struct rl_softc) }; static devclass_t re_devclass; DRIVER_MODULE(re, pci, re_driver, re_devclass, 0, 0); DRIVER_MODULE(miibus, re, miibus_driver, miibus_devclass, 0, 0); #define EE_SET(x) \ CSR_WRITE_1(sc, RL_EECMD, \ CSR_READ_1(sc, RL_EECMD) | x) #define EE_CLR(x) \ CSR_WRITE_1(sc, RL_EECMD, \ CSR_READ_1(sc, RL_EECMD) & ~x) /* * Send a read command and address to the 
EEPROM, check for ACK. */ static void re_eeprom_putbyte(struct rl_softc *sc, int addr) { int d, i; d = addr | (RL_9346_READ << sc->rl_eewidth); /* * Feed in each bit and strobe the clock. */ for (i = 1 << (sc->rl_eewidth + 3); i; i >>= 1) { if (d & i) { EE_SET(RL_EE_DATAIN); } else { EE_CLR(RL_EE_DATAIN); } DELAY(100); EE_SET(RL_EE_CLK); DELAY(150); EE_CLR(RL_EE_CLK); DELAY(100); } } /* * Read a word of data stored in the EEPROM at address 'addr.' */ static void re_eeprom_getword(struct rl_softc *sc, int addr, u_int16_t *dest) { int i; u_int16_t word = 0; /* * Send address of word we want to read. */ re_eeprom_putbyte(sc, addr); /* * Start reading bits from EEPROM. */ for (i = 0x8000; i; i >>= 1) { EE_SET(RL_EE_CLK); DELAY(100); if (CSR_READ_1(sc, RL_EECMD) & RL_EE_DATAOUT) word |= i; EE_CLR(RL_EE_CLK); DELAY(100); } *dest = word; } /* * Read a sequence of words from the EEPROM. */ static void re_read_eeprom(struct rl_softc *sc, caddr_t dest, int off, int cnt) { int i; u_int16_t word = 0, *ptr; CSR_SETBIT_1(sc, RL_EECMD, RL_EEMODE_PROGRAM); DELAY(100); for (i = 0; i < cnt; i++) { CSR_SETBIT_1(sc, RL_EECMD, RL_EE_SEL); re_eeprom_getword(sc, off + i, &word); CSR_CLRBIT_1(sc, RL_EECMD, RL_EE_SEL); ptr = (u_int16_t *)(dest + (i * 2)); *ptr = word; } CSR_CLRBIT_1(sc, RL_EECMD, RL_EEMODE_PROGRAM); } static int re_gmii_readreg(device_t dev, int phy, int reg) { struct rl_softc *sc; u_int32_t rval; int i; sc = device_get_softc(dev); /* Let the rgephy driver read the GMEDIASTAT register */ if (reg == RL_GMEDIASTAT) { rval = CSR_READ_1(sc, RL_GMEDIASTAT); return (rval); } CSR_WRITE_4(sc, RL_PHYAR, reg << 16); for (i = 0; i < RL_PHY_TIMEOUT; i++) { rval = CSR_READ_4(sc, RL_PHYAR); if (rval & RL_PHYAR_BUSY) break; DELAY(25); } if (i == RL_PHY_TIMEOUT) { device_printf(sc->rl_dev, "PHY read failed\n"); return (0); } /* * Controller requires a 20us delay to process next MDIO request. */ DELAY(20); return (rval & RL_PHYAR_PHYDATA); } static int re_gmii_writereg(device_t dev, int phy, int reg, int data) { struct rl_softc *sc; u_int32_t rval; int i; sc = device_get_softc(dev); CSR_WRITE_4(sc, RL_PHYAR, (reg << 16) | (data & RL_PHYAR_PHYDATA) | RL_PHYAR_BUSY); for (i = 0; i < RL_PHY_TIMEOUT; i++) { rval = CSR_READ_4(sc, RL_PHYAR); if (!(rval & RL_PHYAR_BUSY)) break; DELAY(25); } if (i == RL_PHY_TIMEOUT) { device_printf(sc->rl_dev, "PHY write failed\n"); return (0); } /* * Controller requires a 20us delay to process next MDIO request. */ DELAY(20); return (0); } static int re_miibus_readreg(device_t dev, int phy, int reg) { struct rl_softc *sc; u_int16_t rval = 0; u_int16_t re8139_reg = 0; sc = device_get_softc(dev); if (sc->rl_type == RL_8169) { rval = re_gmii_readreg(dev, phy, reg); return (rval); } switch (reg) { case MII_BMCR: re8139_reg = RL_BMCR; break; case MII_BMSR: re8139_reg = RL_BMSR; break; case MII_ANAR: re8139_reg = RL_ANAR; break; case MII_ANER: re8139_reg = RL_ANER; break; case MII_ANLPAR: re8139_reg = RL_LPAR; break; case MII_PHYIDR1: case MII_PHYIDR2: return (0); /* * Allow the rlphy driver to read the media status * register. If we have a link partner which does not * support NWAY, this is the register which will tell * us the results of parallel detection. */ case RL_MEDIASTAT: rval = CSR_READ_1(sc, RL_MEDIASTAT); return (rval); default: device_printf(sc->rl_dev, "bad phy register\n"); return (0); } rval = CSR_READ_2(sc, re8139_reg); if (sc->rl_type == RL_8139CPLUS && re8139_reg == RL_BMCR) { /* 8139C+ has different bit layout. 
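For reference on the serial EEPROM access above: re_eeprom_getword() clocks in 16 data bits MSB-first, setting bit i of the result whenever the EEPROM drives its data-out line. The standalone sketch below reproduces only that accumulation loop, with the serial line replaced by an array of sample bits; it is an illustration, not the driver routine.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for sampling RL_EE_DATAOUT after a clock pulse. */
static int sample_bit(const int *bits, int idx)
{
	return bits[idx];
}

int main(void)
{
	/* 16 sample bits, most significant first (1010 0101 1100 0011 = 0xA5C3). */
	const int bits[16] = { 1,0,1,0, 0,1,0,1, 1,1,0,0, 0,0,1,1 };
	uint16_t word = 0;
	int idx = 0;

	/* Accumulate MSB-first, exactly as the driver walks i from 0x8000 down to 1. */
	for (uint16_t i = 0x8000; i != 0; i >>= 1, idx++) {
		if (sample_bit(bits, idx))
			word |= i;
	}
	printf("word = 0x%04X\n", word);	/* prints 0xA5C3 */
	return 0;
}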
*/ rval &= ~(BMCR_LOOP | BMCR_ISO); } return (rval); } static int re_miibus_writereg(device_t dev, int phy, int reg, int data) { struct rl_softc *sc; u_int16_t re8139_reg = 0; int rval = 0; sc = device_get_softc(dev); if (sc->rl_type == RL_8169) { rval = re_gmii_writereg(dev, phy, reg, data); return (rval); } switch (reg) { case MII_BMCR: re8139_reg = RL_BMCR; if (sc->rl_type == RL_8139CPLUS) { /* 8139C+ has different bit layout. */ data &= ~(BMCR_LOOP | BMCR_ISO); } break; case MII_BMSR: re8139_reg = RL_BMSR; break; case MII_ANAR: re8139_reg = RL_ANAR; break; case MII_ANER: re8139_reg = RL_ANER; break; case MII_ANLPAR: re8139_reg = RL_LPAR; break; case MII_PHYIDR1: case MII_PHYIDR2: return (0); break; default: device_printf(sc->rl_dev, "bad phy register\n"); return (0); } CSR_WRITE_2(sc, re8139_reg, data); return (0); } static void re_miibus_statchg(device_t dev) { struct rl_softc *sc; struct ifnet *ifp; struct mii_data *mii; sc = device_get_softc(dev); mii = device_get_softc(sc->rl_miibus); ifp = sc->rl_ifp; if (mii == NULL || ifp == NULL || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; sc->rl_flags &= ~RL_FLAG_LINK; if ((mii->mii_media_status & (IFM_ACTIVE | IFM_AVALID)) == (IFM_ACTIVE | IFM_AVALID)) { switch (IFM_SUBTYPE(mii->mii_media_active)) { case IFM_10_T: case IFM_100_TX: sc->rl_flags |= RL_FLAG_LINK; break; case IFM_1000_T: if ((sc->rl_flags & RL_FLAG_FASTETHER) != 0) break; sc->rl_flags |= RL_FLAG_LINK; break; default: break; } } /* * RealTek controllers do not provide any interface to the RX/TX * MACs for resolved speed, duplex and flow-control parameters. */ } /* * Set the RX configuration and 64-bit multicast hash filter. */ static void re_set_rxmode(struct rl_softc *sc) { struct ifnet *ifp; struct ifmultiaddr *ifma; uint32_t hashes[2] = { 0, 0 }; uint32_t h, rxfilt; RL_LOCK_ASSERT(sc); ifp = sc->rl_ifp; rxfilt = RL_RXCFG_CONFIG | RL_RXCFG_RX_INDIV | RL_RXCFG_RX_BROAD; if ((sc->rl_flags & RL_FLAG_EARLYOFF) != 0) rxfilt |= RL_RXCFG_EARLYOFF; else if ((sc->rl_flags & RL_FLAG_8168G_PLUS) != 0) rxfilt |= RL_RXCFG_EARLYOFFV2; if (ifp->if_flags & (IFF_ALLMULTI | IFF_PROMISC)) { if (ifp->if_flags & IFF_PROMISC) rxfilt |= RL_RXCFG_RX_ALLPHYS; /* * Unlike other hardwares, we have to explicitly set * RL_RXCFG_RX_MULTI to receive multicast frames in * promiscuous mode. */ rxfilt |= RL_RXCFG_RX_MULTI; hashes[0] = hashes[1] = 0xffffffff; goto done; } if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; h = ether_crc32_be(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), ETHER_ADDR_LEN) >> 26; if (h < 32) hashes[0] |= (1 << h); else hashes[1] |= (1 << (h - 32)); } if_maddr_runlock(ifp); if (hashes[0] != 0 || hashes[1] != 0) { /* * For some unfathomable reason, RealTek decided to * reverse the order of the multicast hash registers * in the PCI Express parts. This means we have to * write the hash pattern in reverse order for those * devices. */ if ((sc->rl_flags & RL_FLAG_PCIE) != 0) { h = bswap32(hashes[0]); hashes[0] = bswap32(hashes[1]); hashes[1] = h; } rxfilt |= RL_RXCFG_RX_MULTI; } if (sc->rl_hwrev->rl_rev == RL_HWREV_8168F) { /* Disable multicast filtering due to silicon bug. 
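The multicast filter setup above hashes each group address with ether_crc32_be() and uses the top six bits of the CRC to select one of the 64 filter bits spread across MAR0 and MAR4 (with the two words additionally swapped and byte-reversed on PCIe parts). The standalone sketch below shows how the bit is derived, assuming the conventional big-endian Ethernet CRC-32 (polynomial 0x04C11DB7, each byte fed LSB-first, initial value ~0); the local crc32_be() helper is an illustration, not the kernel routine.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/*
 * Big-endian Ethernet CRC-32: each byte is fed LSB-first (wire bit order)
 * into an MSB-first register. This mirrors the usual ether_crc32_be()
 * formulation.
 */
static uint32_t
crc32_be(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0xffffffff;

	for (size_t i = 0; i < len; i++) {
		uint8_t data = buf[i];
		for (int bit = 0; bit < 8; bit++, data >>= 1) {
			uint32_t carry = ((crc >> 31) & 1) ^ (data & 1);
			crc <<= 1;
			if (carry)
				crc ^= 0x04c11db7;
		}
	}
	return (crc);
}

int main(void)
{
	/* Sample multicast group address (IPv4 all-hosts). */
	const uint8_t mac[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
	uint32_t hashes[2] = { 0, 0 };

	/* The top 6 CRC bits pick one of 64 filter bits, split across MAR0/MAR4. */
	uint32_t h = crc32_be(mac, 6) >> 26;
	if (h < 32)
		hashes[0] |= 1u << h;
	else
		hashes[1] |= 1u << (h - 32);

	printf("hash bit %u -> MAR%c\n", (unsigned)h, h < 32 ? '0' : '4');
	printf("MAR0 = 0x%08x  MAR4 = 0x%08x\n",
	    (unsigned)hashes[0], (unsigned)hashes[1]);
	return 0;
}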
*/ hashes[0] = 0xffffffff; hashes[1] = 0xffffffff; } done: CSR_WRITE_4(sc, RL_MAR0, hashes[0]); CSR_WRITE_4(sc, RL_MAR4, hashes[1]); CSR_WRITE_4(sc, RL_RXCFG, rxfilt); } static void re_reset(struct rl_softc *sc) { int i; RL_LOCK_ASSERT(sc); CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_RESET); for (i = 0; i < RL_TIMEOUT; i++) { DELAY(10); if (!(CSR_READ_1(sc, RL_COMMAND) & RL_CMD_RESET)) break; } if (i == RL_TIMEOUT) device_printf(sc->rl_dev, "reset never completed!\n"); if ((sc->rl_flags & RL_FLAG_MACRESET) != 0) CSR_WRITE_1(sc, 0x82, 1); if (sc->rl_hwrev->rl_rev == RL_HWREV_8169S) re_gmii_writereg(sc->rl_dev, 1, 0x0b, 0); } #ifdef RE_DIAG /* * The following routine is designed to test for a defect on some * 32-bit 8169 cards. Some of these NICs have the REQ64# and ACK64# * lines connected to the bus, however for a 32-bit only card, they * should be pulled high. The result of this defect is that the * NIC will not work right if you plug it into a 64-bit slot: DMA * operations will be done with 64-bit transfers, which will fail * because the 64-bit data lines aren't connected. * * There's no way to work around this (short of talking a soldering * iron to the board), however we can detect it. The method we use * here is to put the NIC into digital loopback mode, set the receiver * to promiscuous mode, and then try to send a frame. We then compare * the frame data we sent to what was received. If the data matches, * then the NIC is working correctly, otherwise we know the user has * a defective NIC which has been mistakenly plugged into a 64-bit PCI * slot. In the latter case, there's no way the NIC can work correctly, * so we print out a message on the console and abort the device attach. */ static int re_diag(struct rl_softc *sc) { struct ifnet *ifp = sc->rl_ifp; struct mbuf *m0; struct ether_header *eh; struct rl_desc *cur_rx; u_int16_t status; u_int32_t rxstat; int total_len, i, error = 0, phyaddr; u_int8_t dst[] = { 0x00, 'h', 'e', 'l', 'l', 'o' }; u_int8_t src[] = { 0x00, 'w', 'o', 'r', 'l', 'd' }; /* Allocate a single mbuf */ MGETHDR(m0, M_NOWAIT, MT_DATA); if (m0 == NULL) return (ENOBUFS); RL_LOCK(sc); /* * Initialize the NIC in test mode. This sets the chip up * so that it can send and receive frames, but performs the * following special functions: * - Puts receiver in promiscuous mode * - Enables digital loopback mode * - Leaves interrupts turned off */ ifp->if_flags |= IFF_PROMISC; sc->rl_testmode = 1; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; re_init_locked(sc); sc->rl_flags |= RL_FLAG_LINK; if (sc->rl_type == RL_8169) phyaddr = 1; else phyaddr = 0; re_miibus_writereg(sc->rl_dev, phyaddr, MII_BMCR, BMCR_RESET); for (i = 0; i < RL_TIMEOUT; i++) { status = re_miibus_readreg(sc->rl_dev, phyaddr, MII_BMCR); if (!(status & BMCR_RESET)) break; } re_miibus_writereg(sc->rl_dev, phyaddr, MII_BMCR, BMCR_LOOP); CSR_WRITE_2(sc, RL_ISR, RL_INTRS); DELAY(100000); /* Put some data in the mbuf */ eh = mtod(m0, struct ether_header *); bcopy ((char *)&dst, eh->ether_dhost, ETHER_ADDR_LEN); bcopy ((char *)&src, eh->ether_shost, ETHER_ADDR_LEN); eh->ether_type = htons(ETHERTYPE_IP); m0->m_pkthdr.len = m0->m_len = ETHER_MIN_LEN - ETHER_CRC_LEN; /* * Queue the packet, start transmission. * Note: IF_HANDOFF() ultimately calls re_start() for us. 
*/ CSR_WRITE_2(sc, RL_ISR, 0xFFFF); RL_UNLOCK(sc); /* XXX: re_diag must not be called when in ALTQ mode */ IF_HANDOFF(&ifp->if_snd, m0, ifp); RL_LOCK(sc); m0 = NULL; /* Wait for it to propagate through the chip */ DELAY(100000); for (i = 0; i < RL_TIMEOUT; i++) { status = CSR_READ_2(sc, RL_ISR); CSR_WRITE_2(sc, RL_ISR, status); if ((status & (RL_ISR_TIMEOUT_EXPIRED|RL_ISR_RX_OK)) == (RL_ISR_TIMEOUT_EXPIRED|RL_ISR_RX_OK)) break; DELAY(10); } if (i == RL_TIMEOUT) { device_printf(sc->rl_dev, "diagnostic failed, failed to receive packet in" " loopback mode\n"); error = EIO; goto done; } /* * The packet should have been dumped into the first * entry in the RX DMA ring. Grab it from there. */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, BUS_DMASYNC_POSTREAD); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, sc->rl_ldata.rl_rx_desc[0].rx_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->rl_ldata.rl_rx_mtag, sc->rl_ldata.rl_rx_desc[0].rx_dmamap); m0 = sc->rl_ldata.rl_rx_desc[0].rx_m; sc->rl_ldata.rl_rx_desc[0].rx_m = NULL; eh = mtod(m0, struct ether_header *); cur_rx = &sc->rl_ldata.rl_rx_list[0]; total_len = RL_RXBYTES(cur_rx); rxstat = le32toh(cur_rx->rl_cmdstat); if (total_len != ETHER_MIN_LEN) { device_printf(sc->rl_dev, "diagnostic failed, received short packet\n"); error = EIO; goto done; } /* Test that the received packet data matches what we sent. */ if (bcmp((char *)&eh->ether_dhost, (char *)&dst, ETHER_ADDR_LEN) || bcmp((char *)&eh->ether_shost, (char *)&src, ETHER_ADDR_LEN) || ntohs(eh->ether_type) != ETHERTYPE_IP) { device_printf(sc->rl_dev, "WARNING, DMA FAILURE!\n"); device_printf(sc->rl_dev, "expected TX data: %6D/%6D/0x%x\n", dst, ":", src, ":", ETHERTYPE_IP); device_printf(sc->rl_dev, "received RX data: %6D/%6D/0x%x\n", eh->ether_dhost, ":", eh->ether_shost, ":", ntohs(eh->ether_type)); device_printf(sc->rl_dev, "You may have a defective 32-bit " "NIC plugged into a 64-bit PCI slot.\n"); device_printf(sc->rl_dev, "Please re-install the NIC in a " "32-bit slot for proper operation.\n"); device_printf(sc->rl_dev, "Read the re(4) man page for more " "details.\n"); error = EIO; } done: /* Turn interface off, release resources */ sc->rl_testmode = 0; sc->rl_flags &= ~RL_FLAG_LINK; ifp->if_flags &= ~IFF_PROMISC; re_stop(sc); if (m0 != NULL) m_freem(m0); RL_UNLOCK(sc); return (error); } #endif /* * Probe for a RealTek 8139C+/8169/8110 chip. Check the PCI vendor and device * IDs against our list and return a device name if we find a match. */ static int re_probe(device_t dev) { const struct rl_type *t; uint16_t devid, vendor; uint16_t revid, sdevid; int i; vendor = pci_get_vendor(dev); devid = pci_get_device(dev); revid = pci_get_revid(dev); sdevid = pci_get_subdevice(dev); if (vendor == LINKSYS_VENDORID && devid == LINKSYS_DEVICEID_EG1032) { if (sdevid != LINKSYS_SUBDEVICE_EG1032_REV3) { /* * Only attach to rev. 3 of the Linksys EG1032 adapter. * Rev. 2 is supported by sk(4). */ return (ENXIO); } } if (vendor == RT_VENDORID && devid == RT_DEVICEID_8139) { if (revid != 0x20) { /* 8139, let rl(4) take care of this device. */ return (ENXIO); } } t = re_devs; for (i = 0; i < nitems(re_devs); i++, t++) { if (vendor == t->rl_vid && devid == t->rl_did) { device_set_desc(dev, t->rl_name); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } /* * Map a single buffer address. 
*/ static void re_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *addr; if (error) return; KASSERT(nseg == 1, ("too many DMA segments, %d should be 1", nseg)); addr = arg; *addr = segs->ds_addr; } static int re_allocmem(device_t dev, struct rl_softc *sc) { bus_addr_t lowaddr; bus_size_t rx_list_size, tx_list_size; int error; int i; rx_list_size = sc->rl_ldata.rl_rx_desc_cnt * sizeof(struct rl_desc); tx_list_size = sc->rl_ldata.rl_tx_desc_cnt * sizeof(struct rl_desc); /* * Allocate the parent bus DMA tag appropriate for PCI. * In order to use DAC, RL_CPLUSCMD_PCI_DAC bit of RL_CPLUS_CMD * register should be set. However some RealTek chips are known * to be buggy on DAC handling, therefore disable DAC by limiting * DMA address space to 32bit. PCIe variants of RealTek chips * may not have the limitation. */ lowaddr = BUS_SPACE_MAXADDR; if ((sc->rl_flags & RL_FLAG_PCIE) == 0) lowaddr = BUS_SPACE_MAXADDR_32BIT; error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, lowaddr, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE_32BIT, 0, BUS_SPACE_MAXSIZE_32BIT, 0, NULL, NULL, &sc->rl_parent_tag); if (error) { device_printf(dev, "could not allocate parent DMA tag\n"); return (error); } /* * Allocate map for TX mbufs. */ error = bus_dma_tag_create(sc->rl_parent_tag, 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES * RL_NTXSEGS, RL_NTXSEGS, 4096, 0, NULL, NULL, &sc->rl_ldata.rl_tx_mtag); if (error) { device_printf(dev, "could not allocate TX DMA tag\n"); return (error); } /* * Allocate map for RX mbufs. */ if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0) { error = bus_dma_tag_create(sc->rl_parent_tag, sizeof(uint64_t), 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MJUM9BYTES, 1, MJUM9BYTES, 0, NULL, NULL, &sc->rl_ldata.rl_jrx_mtag); if (error) { device_printf(dev, "could not allocate jumbo RX DMA tag\n"); return (error); } } error = bus_dma_tag_create(sc->rl_parent_tag, sizeof(uint64_t), 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, 0, NULL, NULL, &sc->rl_ldata.rl_rx_mtag); if (error) { device_printf(dev, "could not allocate RX DMA tag\n"); return (error); } /* * Allocate map for TX descriptor list. */ error = bus_dma_tag_create(sc->rl_parent_tag, RL_RING_ALIGN, 0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, tx_list_size, 1, tx_list_size, 0, NULL, NULL, &sc->rl_ldata.rl_tx_list_tag); if (error) { device_printf(dev, "could not allocate TX DMA ring tag\n"); return (error); } /* Allocate DMA'able memory for the TX ring */ error = bus_dmamem_alloc(sc->rl_ldata.rl_tx_list_tag, (void **)&sc->rl_ldata.rl_tx_list, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &sc->rl_ldata.rl_tx_list_map); if (error) { device_printf(dev, "could not allocate TX DMA ring\n"); return (error); } /* Load the map for the TX ring. */ sc->rl_ldata.rl_tx_list_addr = 0; error = bus_dmamap_load(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map, sc->rl_ldata.rl_tx_list, tx_list_size, re_dma_map_addr, &sc->rl_ldata.rl_tx_list_addr, BUS_DMA_NOWAIT); if (error != 0 || sc->rl_ldata.rl_tx_list_addr == 0) { device_printf(dev, "could not load TX DMA ring\n"); return (ENOMEM); } /* Create DMA maps for TX buffers */ for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) { error = bus_dmamap_create(sc->rl_ldata.rl_tx_mtag, 0, &sc->rl_ldata.rl_tx_desc[i].tx_dmamap); if (error) { device_printf(dev, "could not create DMA map for TX\n"); return (error); } } /* * Allocate map for RX descriptor list. 
*/ error = bus_dma_tag_create(sc->rl_parent_tag, RL_RING_ALIGN, 0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, rx_list_size, 1, rx_list_size, 0, NULL, NULL, &sc->rl_ldata.rl_rx_list_tag); if (error) { device_printf(dev, "could not create RX DMA ring tag\n"); return (error); } /* Allocate DMA'able memory for the RX ring */ error = bus_dmamem_alloc(sc->rl_ldata.rl_rx_list_tag, (void **)&sc->rl_ldata.rl_rx_list, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &sc->rl_ldata.rl_rx_list_map); if (error) { device_printf(dev, "could not allocate RX DMA ring\n"); return (error); } /* Load the map for the RX ring. */ sc->rl_ldata.rl_rx_list_addr = 0; error = bus_dmamap_load(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, sc->rl_ldata.rl_rx_list, rx_list_size, re_dma_map_addr, &sc->rl_ldata.rl_rx_list_addr, BUS_DMA_NOWAIT); if (error != 0 || sc->rl_ldata.rl_rx_list_addr == 0) { device_printf(dev, "could not load RX DMA ring\n"); return (ENOMEM); } /* Create DMA maps for RX buffers */ if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0) { error = bus_dmamap_create(sc->rl_ldata.rl_jrx_mtag, 0, &sc->rl_ldata.rl_jrx_sparemap); if (error) { device_printf(dev, "could not create spare DMA map for jumbo RX\n"); return (error); } for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { error = bus_dmamap_create(sc->rl_ldata.rl_jrx_mtag, 0, &sc->rl_ldata.rl_jrx_desc[i].rx_dmamap); if (error) { device_printf(dev, "could not create DMA map for jumbo RX\n"); return (error); } } } error = bus_dmamap_create(sc->rl_ldata.rl_rx_mtag, 0, &sc->rl_ldata.rl_rx_sparemap); if (error) { device_printf(dev, "could not create spare DMA map for RX\n"); return (error); } for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { error = bus_dmamap_create(sc->rl_ldata.rl_rx_mtag, 0, &sc->rl_ldata.rl_rx_desc[i].rx_dmamap); if (error) { device_printf(dev, "could not create DMA map for RX\n"); return (error); } } /* Create DMA map for statistics. */ error = bus_dma_tag_create(sc->rl_parent_tag, RL_DUMP_ALIGN, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, sizeof(struct rl_stats), 1, sizeof(struct rl_stats), 0, NULL, NULL, &sc->rl_ldata.rl_stag); if (error) { device_printf(dev, "could not create statistics DMA tag\n"); return (error); } /* Allocate DMA'able memory for statistics. */ error = bus_dmamem_alloc(sc->rl_ldata.rl_stag, (void **)&sc->rl_ldata.rl_stats, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &sc->rl_ldata.rl_smap); if (error) { device_printf(dev, "could not allocate statistics DMA memory\n"); return (error); } /* Load the map for statistics. */ sc->rl_ldata.rl_stats_addr = 0; error = bus_dmamap_load(sc->rl_ldata.rl_stag, sc->rl_ldata.rl_smap, sc->rl_ldata.rl_stats, sizeof(struct rl_stats), re_dma_map_addr, &sc->rl_ldata.rl_stats_addr, BUS_DMA_NOWAIT); if (error != 0 || sc->rl_ldata.rl_stats_addr == 0) { device_printf(dev, "could not load statistics DMA memory\n"); return (ENOMEM); } return (0); } /* * Attach the interface. Allocate softc structures, do ifmedia * setup and ethernet/BPF attach. */ static int re_attach(device_t dev) { u_char eaddr[ETHER_ADDR_LEN]; u_int16_t as[ETHER_ADDR_LEN / 2]; struct rl_softc *sc; struct ifnet *ifp; const struct rl_hwrev *hw_rev; int capmask, error = 0, hwrev, i, msic, msixc, phy, reg, rid; u_int32_t cap, ctl; u_int16_t devid, re_did = 0; uint8_t cfg; sc = device_get_softc(dev); sc->rl_dev = dev; mtx_init(&sc->rl_mtx, device_get_nameunit(dev), MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->rl_stat_callout, &sc->rl_mtx, 0); /* * Map control/status registers. 
*/ pci_enable_busmaster(dev); devid = pci_get_device(dev); /* * Prefer memory space register mapping over IO space. * Because RTL8169SC does not seem to work when memory mapping * is used always activate io mapping. */ if (devid == RT_DEVICEID_8169SC) prefer_iomap = 1; if (prefer_iomap == 0) { sc->rl_res_id = PCIR_BAR(1); sc->rl_res_type = SYS_RES_MEMORY; /* RTL8168/8101E seems to use different BARs. */ if (devid == RT_DEVICEID_8168 || devid == RT_DEVICEID_8101E) sc->rl_res_id = PCIR_BAR(2); } else { sc->rl_res_id = PCIR_BAR(0); sc->rl_res_type = SYS_RES_IOPORT; } sc->rl_res = bus_alloc_resource_any(dev, sc->rl_res_type, &sc->rl_res_id, RF_ACTIVE); if (sc->rl_res == NULL && prefer_iomap == 0) { sc->rl_res_id = PCIR_BAR(0); sc->rl_res_type = SYS_RES_IOPORT; sc->rl_res = bus_alloc_resource_any(dev, sc->rl_res_type, &sc->rl_res_id, RF_ACTIVE); } if (sc->rl_res == NULL) { device_printf(dev, "couldn't map ports/memory\n"); error = ENXIO; goto fail; } sc->rl_btag = rman_get_bustag(sc->rl_res); sc->rl_bhandle = rman_get_bushandle(sc->rl_res); msic = pci_msi_count(dev); msixc = pci_msix_count(dev); if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) { sc->rl_flags |= RL_FLAG_PCIE; sc->rl_expcap = reg; } if (bootverbose) { device_printf(dev, "MSI count : %d\n", msic); device_printf(dev, "MSI-X count : %d\n", msixc); } if (msix_disable > 0) msixc = 0; if (msi_disable > 0) msic = 0; /* Prefer MSI-X to MSI. */ if (msixc > 0) { msixc = RL_MSI_MESSAGES; rid = PCIR_BAR(4); sc->rl_res_pba = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->rl_res_pba == NULL) { device_printf(sc->rl_dev, "could not allocate MSI-X PBA resource\n"); } if (sc->rl_res_pba != NULL && pci_alloc_msix(dev, &msixc) == 0) { if (msixc == RL_MSI_MESSAGES) { device_printf(dev, "Using %d MSI-X message\n", msixc); sc->rl_flags |= RL_FLAG_MSIX; } else pci_release_msi(dev); } if ((sc->rl_flags & RL_FLAG_MSIX) == 0) { if (sc->rl_res_pba != NULL) bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->rl_res_pba); sc->rl_res_pba = NULL; msixc = 0; } } /* Prefer MSI to INTx. */ if (msixc == 0 && msic > 0) { msic = RL_MSI_MESSAGES; if (pci_alloc_msi(dev, &msic) == 0) { if (msic == RL_MSI_MESSAGES) { device_printf(dev, "Using %d MSI message\n", msic); sc->rl_flags |= RL_FLAG_MSI; /* Explicitly set MSI enable bit. */ CSR_WRITE_1(sc, RL_EECMD, RL_EE_MODE); cfg = CSR_READ_1(sc, RL_CFG2); cfg |= RL_CFG2_MSI; CSR_WRITE_1(sc, RL_CFG2, cfg); CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_OFF); } else pci_release_msi(dev); } if ((sc->rl_flags & RL_FLAG_MSI) == 0) msic = 0; } /* Allocate interrupt */ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0) { rid = 0; sc->rl_irq[0] = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE); if (sc->rl_irq[0] == NULL) { device_printf(dev, "couldn't allocate IRQ resources\n"); error = ENXIO; goto fail; } } else { for (i = 0, rid = 1; i < RL_MSI_MESSAGES; i++, rid++) { sc->rl_irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (sc->rl_irq[i] == NULL) { device_printf(dev, "couldn't allocate IRQ resources for " "message %d\n", rid); error = ENXIO; goto fail; } } } if ((sc->rl_flags & RL_FLAG_MSI) == 0) { CSR_WRITE_1(sc, RL_EECMD, RL_EE_MODE); cfg = CSR_READ_1(sc, RL_CFG2); if ((cfg & RL_CFG2_MSI) != 0) { device_printf(dev, "turning off MSI enable bit.\n"); cfg &= ~RL_CFG2_MSI; CSR_WRITE_1(sc, RL_CFG2, cfg); } CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_OFF); } - /* Disable ASPM L0S/L1. */ + /* Disable ASPM L0S/L1 and CLKREQ. 
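The hunk here widens the mask cleared in the PCIe Link Control register from the ASPM control field alone to ASPM control plus the clock power management (CLKREQ) enable bit. The standalone sketch below shows only that mask arithmetic on a sample register value; the bit positions (ASPM control in bits 1:0, clock PM enable in bit 8) follow the PCIe spec, and the constants are illustrative stand-ins for the pcireg.h macros, not their definitions.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for the link-control bits used by the change. */
#define LINK_CTL_ASPMC	0x0003	/* ASPM control, bits 1:0 (L0s/L1) */
#define LINK_CTL_ECPM	0x0100	/* Enable Clock Power Management (CLKREQ), bit 8 */

int main(void)
{
	/* Sample link-control value with L1 ASPM and CLKREQ both enabled. */
	uint16_t ctl = 0x0142;

	printf("before: 0x%04x\n", ctl);

	/* The old code cleared only the ASPM field; the change clears CLKREQ too. */
	if ((ctl & (LINK_CTL_ECPM | LINK_CTL_ASPMC)) != 0) {
		ctl &= (uint16_t)~(LINK_CTL_ECPM | LINK_CTL_ASPMC);
		printf("after:  0x%04x (ASPM and CLKREQ disabled)\n", ctl);
	}
	return 0;
}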
*/ if (sc->rl_expcap != 0) { cap = pci_read_config(dev, sc->rl_expcap + PCIER_LINK_CAP, 2); if ((cap & PCIEM_LINK_CAP_ASPM) != 0) { ctl = pci_read_config(dev, sc->rl_expcap + PCIER_LINK_CTL, 2); - if ((ctl & PCIEM_LINK_CTL_ASPMC) != 0) { - ctl &= ~PCIEM_LINK_CTL_ASPMC; + if ((ctl & (PCIEM_LINK_CTL_ECPM | + PCIEM_LINK_CTL_ASPMC))!= 0) { + ctl &= ~(PCIEM_LINK_CTL_ECPM | + PCIEM_LINK_CTL_ASPMC); pci_write_config(dev, sc->rl_expcap + PCIER_LINK_CTL, ctl, 2); device_printf(dev, "ASPM disabled\n"); } } else device_printf(dev, "no ASPM capability\n"); } hw_rev = re_hwrevs; hwrev = CSR_READ_4(sc, RL_TXCFG); switch (hwrev & 0x70000000) { case 0x00000000: case 0x10000000: device_printf(dev, "Chip rev. 0x%08x\n", hwrev & 0xfc800000); hwrev &= (RL_TXCFG_HWREV | 0x80000000); break; default: device_printf(dev, "Chip rev. 0x%08x\n", hwrev & 0x7c800000); sc->rl_macrev = hwrev & 0x00700000; hwrev &= RL_TXCFG_HWREV; break; } device_printf(dev, "MAC rev. 0x%08x\n", sc->rl_macrev); while (hw_rev->rl_desc != NULL) { if (hw_rev->rl_rev == hwrev) { sc->rl_type = hw_rev->rl_type; sc->rl_hwrev = hw_rev; break; } hw_rev++; } if (hw_rev->rl_desc == NULL) { device_printf(dev, "Unknown H/W revision: 0x%08x\n", hwrev); error = ENXIO; goto fail; } switch (hw_rev->rl_rev) { case RL_HWREV_8139CPLUS: sc->rl_flags |= RL_FLAG_FASTETHER | RL_FLAG_AUTOPAD; break; case RL_HWREV_8100E: case RL_HWREV_8101E: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_FASTETHER; break; case RL_HWREV_8102E: case RL_HWREV_8102EL: case RL_HWREV_8102EL_SPIN1: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_FASTETHER | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD; break; case RL_HWREV_8103E: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_FASTETHER | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_MACSLEEP; break; case RL_HWREV_8401E: case RL_HWREV_8105E: case RL_HWREV_8105E_SPIN1: case RL_HWREV_8106E: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PHYWAKE_PM | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_FASTETHER | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD; break; case RL_HWREV_8402: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PHYWAKE_PM | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_FASTETHER | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_CMDSTOP_WAIT_TXQ; break; case RL_HWREV_8168B_SPIN1: case RL_HWREV_8168B_SPIN2: sc->rl_flags |= RL_FLAG_WOLRXENB; /* FALLTHROUGH */ case RL_HWREV_8168B_SPIN3: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_MACSTAT; break; case RL_HWREV_8168C_SPIN2: sc->rl_flags |= RL_FLAG_MACSLEEP; /* FALLTHROUGH */ case RL_HWREV_8168C: if (sc->rl_macrev == 0x00200000) sc->rl_flags |= RL_FLAG_MACSLEEP; /* FALLTHROUGH */ case RL_HWREV_8168CP: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_JUMBOV2 | RL_FLAG_WOL_MANLINK; break; case RL_HWREV_8168D: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PHYWAKE_PM | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_JUMBOV2 | RL_FLAG_WOL_MANLINK; break; case RL_HWREV_8168DP: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_AUTOPAD | RL_FLAG_JUMBOV2 | RL_FLAG_WAIT_TXPOLL | RL_FLAG_WOL_MANLINK; break; case RL_HWREV_8168E: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PHYWAKE_PM | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_JUMBOV2 | RL_FLAG_WOL_MANLINK; break; case RL_HWREV_8168E_VL: case RL_HWREV_8168F: sc->rl_flags |= 
RL_FLAG_EARLYOFF; /* FALLTHROUGH */ case RL_HWREV_8411: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_JUMBOV2 | RL_FLAG_CMDSTOP_WAIT_TXQ | RL_FLAG_WOL_MANLINK; break; case RL_HWREV_8168EP: case RL_HWREV_8168G: case RL_HWREV_8411B: sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_JUMBOV2 | RL_FLAG_CMDSTOP_WAIT_TXQ | RL_FLAG_WOL_MANLINK | RL_FLAG_8168G_PLUS; break; case RL_HWREV_8168GU: case RL_HWREV_8168H: if (pci_get_device(dev) == RT_DEVICEID_8101E) { /* RTL8106E(US), RTL8107E */ sc->rl_flags |= RL_FLAG_FASTETHER; } else sc->rl_flags |= RL_FLAG_JUMBOV2 | RL_FLAG_WOL_MANLINK; sc->rl_flags |= RL_FLAG_PHYWAKE | RL_FLAG_PAR | RL_FLAG_DESCV2 | RL_FLAG_MACSTAT | RL_FLAG_CMDSTOP | RL_FLAG_AUTOPAD | RL_FLAG_CMDSTOP_WAIT_TXQ | RL_FLAG_8168G_PLUS; break; case RL_HWREV_8169_8110SB: case RL_HWREV_8169_8110SBL: case RL_HWREV_8169_8110SC: case RL_HWREV_8169_8110SCE: sc->rl_flags |= RL_FLAG_PHYWAKE; /* FALLTHROUGH */ case RL_HWREV_8169: case RL_HWREV_8169S: case RL_HWREV_8110S: sc->rl_flags |= RL_FLAG_MACRESET; break; default: break; } if (sc->rl_hwrev->rl_rev == RL_HWREV_8139CPLUS) { sc->rl_cfg0 = RL_8139_CFG0; sc->rl_cfg1 = RL_8139_CFG1; sc->rl_cfg2 = 0; sc->rl_cfg3 = RL_8139_CFG3; sc->rl_cfg4 = RL_8139_CFG4; sc->rl_cfg5 = RL_8139_CFG5; } else { sc->rl_cfg0 = RL_CFG0; sc->rl_cfg1 = RL_CFG1; sc->rl_cfg2 = RL_CFG2; sc->rl_cfg3 = RL_CFG3; sc->rl_cfg4 = RL_CFG4; sc->rl_cfg5 = RL_CFG5; } /* Reset the adapter. */ RL_LOCK(sc); re_reset(sc); RL_UNLOCK(sc); /* Enable PME. */ CSR_WRITE_1(sc, RL_EECMD, RL_EE_MODE); cfg = CSR_READ_1(sc, sc->rl_cfg1); cfg |= RL_CFG1_PME; CSR_WRITE_1(sc, sc->rl_cfg1, cfg); cfg = CSR_READ_1(sc, sc->rl_cfg5); cfg &= RL_CFG5_PME_STS; CSR_WRITE_1(sc, sc->rl_cfg5, cfg); CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_OFF); if ((sc->rl_flags & RL_FLAG_PAR) != 0) { /* * XXX Should have a better way to extract station * address from EEPROM. */ for (i = 0; i < ETHER_ADDR_LEN; i++) eaddr[i] = CSR_READ_1(sc, RL_IDR0 + i); } else { sc->rl_eewidth = RL_9356_ADDR_LEN; re_read_eeprom(sc, (caddr_t)&re_did, 0, 1); if (re_did != 0x8129) sc->rl_eewidth = RL_9346_ADDR_LEN; /* * Get station address from the EEPROM. */ re_read_eeprom(sc, (caddr_t)as, RL_EE_EADDR, 3); for (i = 0; i < ETHER_ADDR_LEN / 2; i++) as[i] = le16toh(as[i]); bcopy(as, eaddr, ETHER_ADDR_LEN); } if (sc->rl_type == RL_8169) { /* Set RX length mask and number of descriptors. */ sc->rl_rxlenmask = RL_RDESC_STAT_GFRAGLEN; sc->rl_txstart = RL_GTXSTART; sc->rl_ldata.rl_tx_desc_cnt = RL_8169_TX_DESC_CNT; sc->rl_ldata.rl_rx_desc_cnt = RL_8169_RX_DESC_CNT; } else { /* Set RX length mask and number of descriptors. */ sc->rl_rxlenmask = RL_RDESC_STAT_FRAGLEN; sc->rl_txstart = RL_TXSTART; sc->rl_ldata.rl_tx_desc_cnt = RL_8139_TX_DESC_CNT; sc->rl_ldata.rl_rx_desc_cnt = RL_8139_RX_DESC_CNT; } error = re_allocmem(dev, sc); if (error) goto fail; re_add_sysctls(sc); ifp = sc->rl_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "can not if_alloc()\n"); error = ENOSPC; goto fail; } /* Take controller out of deep sleep mode. */ if ((sc->rl_flags & RL_FLAG_MACSLEEP) != 0) { if ((CSR_READ_1(sc, RL_MACDBG) & 0x80) == 0x80) CSR_WRITE_1(sc, RL_GPIO, CSR_READ_1(sc, RL_GPIO) | 0x01); else CSR_WRITE_1(sc, RL_GPIO, CSR_READ_1(sc, RL_GPIO) & ~0x01); } /* Take PHY out of power down mode. 
*/ if ((sc->rl_flags & RL_FLAG_PHYWAKE_PM) != 0) { CSR_WRITE_1(sc, RL_PMCH, CSR_READ_1(sc, RL_PMCH) | 0x80); if (hw_rev->rl_rev == RL_HWREV_8401E) CSR_WRITE_1(sc, 0xD1, CSR_READ_1(sc, 0xD1) & ~0x08); } if ((sc->rl_flags & RL_FLAG_PHYWAKE) != 0) { re_gmii_writereg(dev, 1, 0x1f, 0); re_gmii_writereg(dev, 1, 0x0e, 0); } ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = re_ioctl; ifp->if_start = re_start; /* * RTL8168/8111C generates wrong IP checksummed frame if the * packet has IP options so disable TX checksum offloading. */ if (sc->rl_hwrev->rl_rev == RL_HWREV_8168C || sc->rl_hwrev->rl_rev == RL_HWREV_8168C_SPIN2 || sc->rl_hwrev->rl_rev == RL_HWREV_8168CP) { ifp->if_hwassist = 0; ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TSO4; } else { ifp->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_TSO4; } ifp->if_hwassist |= CSUM_TSO; ifp->if_capenable = ifp->if_capabilities; ifp->if_init = re_init; IFQ_SET_MAXLEN(&ifp->if_snd, RL_IFQ_MAXLEN); ifp->if_snd.ifq_drv_maxlen = RL_IFQ_MAXLEN; IFQ_SET_READY(&ifp->if_snd); TASK_INIT(&sc->rl_inttask, 0, re_int_task, sc); #define RE_PHYAD_INTERNAL 0 /* Do MII setup. */ phy = RE_PHYAD_INTERNAL; if (sc->rl_type == RL_8169) phy = 1; capmask = BMSR_DEFCAPMASK; if ((sc->rl_flags & RL_FLAG_FASTETHER) != 0) capmask &= ~BMSR_EXTSTAT; error = mii_attach(dev, &sc->rl_miibus, ifp, re_ifmedia_upd, re_ifmedia_sts, capmask, phy, MII_OFFSET_ANY, MIIF_DOPAUSE); if (error != 0) { device_printf(dev, "attaching PHYs failed\n"); goto fail; } /* * Call MI attach routine. */ ether_ifattach(ifp, eaddr); /* VLAN capability setup */ ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; if (ifp->if_capabilities & IFCAP_HWCSUM) ifp->if_capabilities |= IFCAP_VLAN_HWCSUM; /* Enable WOL if PM is supported. */ if (pci_find_cap(sc->rl_dev, PCIY_PMG, ®) == 0) ifp->if_capabilities |= IFCAP_WOL; ifp->if_capenable = ifp->if_capabilities; ifp->if_capenable &= ~(IFCAP_WOL_UCAST | IFCAP_WOL_MCAST); /* * Don't enable TSO by default. It is known to generate * corrupted TCP segments(bad TCP options) under certain * circumstances. */ ifp->if_hwassist &= ~CSUM_TSO; ifp->if_capenable &= ~(IFCAP_TSO4 | IFCAP_VLAN_HWTSO); #ifdef DEVICE_POLLING ifp->if_capabilities |= IFCAP_POLLING; #endif /* * Tell the upper layer(s) we support long frames. * Must appear after the call to ether_ifattach() because * ether_ifattach() sets ifi_hdrlen to the default value. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); #ifdef DEV_NETMAP re_netmap_attach(sc); #endif /* DEV_NETMAP */ #ifdef RE_DIAG /* * Perform hardware diagnostic on the original RTL8169. * Some 32-bit cards were incorrectly wired and would * malfunction if plugged into a 64-bit slot. 
*/ if (hwrev == RL_HWREV_8169) { error = re_diag(sc); if (error) { device_printf(dev, "attach aborted due to hardware diag failure\n"); ether_ifdetach(ifp); goto fail; } } #endif #ifdef RE_TX_MODERATION intr_filter = 1; #endif /* Hook interrupt last to avoid having to lock softc */ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 && intr_filter == 0) { error = bus_setup_intr(dev, sc->rl_irq[0], INTR_TYPE_NET | INTR_MPSAFE, NULL, re_intr_msi, sc, &sc->rl_intrhand[0]); } else { error = bus_setup_intr(dev, sc->rl_irq[0], INTR_TYPE_NET | INTR_MPSAFE, re_intr, NULL, sc, &sc->rl_intrhand[0]); } if (error) { device_printf(dev, "couldn't set up irq\n"); ether_ifdetach(ifp); } fail: if (error) re_detach(dev); return (error); } /* * Shutdown hardware and free up resources. This can be called any * time after the mutex has been initialized. It is called in both * the error case in attach and the normal detach case so it needs * to be careful about only freeing resources that have actually been * allocated. */ static int re_detach(device_t dev) { struct rl_softc *sc; struct ifnet *ifp; int i, rid; sc = device_get_softc(dev); ifp = sc->rl_ifp; KASSERT(mtx_initialized(&sc->rl_mtx), ("re mutex not initialized")); /* These should only be active if attach succeeded */ if (device_is_attached(dev)) { #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) ether_poll_deregister(ifp); #endif RL_LOCK(sc); #if 0 sc->suspended = 1; #endif re_stop(sc); RL_UNLOCK(sc); callout_drain(&sc->rl_stat_callout); taskqueue_drain(taskqueue_fast, &sc->rl_inttask); /* * Force off the IFF_UP flag here, in case someone * still had a BPF descriptor attached to this * interface. If they do, ether_ifdetach() will cause * the BPF code to try and clear the promisc mode * flag, which will bubble down to re_ioctl(), * which will try to call re_init() again. This will * turn the NIC back on and restart the MII ticker, * which will panic the system when the kernel tries * to invoke the re_tick() function that isn't there * anymore. */ ifp->if_flags &= ~IFF_UP; ether_ifdetach(ifp); } if (sc->rl_miibus) device_delete_child(dev, sc->rl_miibus); bus_generic_detach(dev); /* * The rest is resource deallocation, so we should already be * stopped here. 
*/ if (sc->rl_intrhand[0] != NULL) { bus_teardown_intr(dev, sc->rl_irq[0], sc->rl_intrhand[0]); sc->rl_intrhand[0] = NULL; } if (ifp != NULL) { #ifdef DEV_NETMAP netmap_detach(ifp); #endif /* DEV_NETMAP */ if_free(ifp); } if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0) rid = 0; else rid = 1; if (sc->rl_irq[0] != NULL) { bus_release_resource(dev, SYS_RES_IRQ, rid, sc->rl_irq[0]); sc->rl_irq[0] = NULL; } if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0) pci_release_msi(dev); if (sc->rl_res_pba) { rid = PCIR_BAR(4); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->rl_res_pba); } if (sc->rl_res) bus_release_resource(dev, sc->rl_res_type, sc->rl_res_id, sc->rl_res); /* Unload and free the RX DMA ring memory and map */ if (sc->rl_ldata.rl_rx_list_tag) { if (sc->rl_ldata.rl_rx_list_addr) bus_dmamap_unload(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map); if (sc->rl_ldata.rl_rx_list) bus_dmamem_free(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list, sc->rl_ldata.rl_rx_list_map); bus_dma_tag_destroy(sc->rl_ldata.rl_rx_list_tag); } /* Unload and free the TX DMA ring memory and map */ if (sc->rl_ldata.rl_tx_list_tag) { if (sc->rl_ldata.rl_tx_list_addr) bus_dmamap_unload(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map); if (sc->rl_ldata.rl_tx_list) bus_dmamem_free(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list, sc->rl_ldata.rl_tx_list_map); bus_dma_tag_destroy(sc->rl_ldata.rl_tx_list_tag); } /* Destroy all the RX and TX buffer maps */ if (sc->rl_ldata.rl_tx_mtag) { for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) { if (sc->rl_ldata.rl_tx_desc[i].tx_dmamap) bus_dmamap_destroy(sc->rl_ldata.rl_tx_mtag, sc->rl_ldata.rl_tx_desc[i].tx_dmamap); } bus_dma_tag_destroy(sc->rl_ldata.rl_tx_mtag); } if (sc->rl_ldata.rl_rx_mtag) { for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { if (sc->rl_ldata.rl_rx_desc[i].rx_dmamap) bus_dmamap_destroy(sc->rl_ldata.rl_rx_mtag, sc->rl_ldata.rl_rx_desc[i].rx_dmamap); } if (sc->rl_ldata.rl_rx_sparemap) bus_dmamap_destroy(sc->rl_ldata.rl_rx_mtag, sc->rl_ldata.rl_rx_sparemap); bus_dma_tag_destroy(sc->rl_ldata.rl_rx_mtag); } if (sc->rl_ldata.rl_jrx_mtag) { for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { if (sc->rl_ldata.rl_jrx_desc[i].rx_dmamap) bus_dmamap_destroy(sc->rl_ldata.rl_jrx_mtag, sc->rl_ldata.rl_jrx_desc[i].rx_dmamap); } if (sc->rl_ldata.rl_jrx_sparemap) bus_dmamap_destroy(sc->rl_ldata.rl_jrx_mtag, sc->rl_ldata.rl_jrx_sparemap); bus_dma_tag_destroy(sc->rl_ldata.rl_jrx_mtag); } /* Unload and free the stats buffer and map */ if (sc->rl_ldata.rl_stag) { if (sc->rl_ldata.rl_stats_addr) bus_dmamap_unload(sc->rl_ldata.rl_stag, sc->rl_ldata.rl_smap); if (sc->rl_ldata.rl_stats) bus_dmamem_free(sc->rl_ldata.rl_stag, sc->rl_ldata.rl_stats, sc->rl_ldata.rl_smap); bus_dma_tag_destroy(sc->rl_ldata.rl_stag); } if (sc->rl_parent_tag) bus_dma_tag_destroy(sc->rl_parent_tag); mtx_destroy(&sc->rl_mtx); return (0); } static __inline void re_discard_rxbuf(struct rl_softc *sc, int idx) { struct rl_desc *desc; struct rl_rxdesc *rxd; uint32_t cmdstat; if (sc->rl_ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0) rxd = &sc->rl_ldata.rl_jrx_desc[idx]; else rxd = &sc->rl_ldata.rl_rx_desc[idx]; desc = &sc->rl_ldata.rl_rx_list[idx]; desc->rl_vlanctl = 0; cmdstat = rxd->rx_size; if (idx == sc->rl_ldata.rl_rx_desc_cnt - 1) cmdstat |= RL_RDESC_CMD_EOR; desc->rl_cmdstat = htole32(cmdstat | RL_RDESC_CMD_OWN); } static int re_newbuf(struct rl_softc *sc, int idx) { struct mbuf *m; struct rl_rxdesc *rxd; bus_dma_segment_t segs[1]; bus_dmamap_t map; 
struct rl_desc *desc; uint32_t cmdstat; int error, nsegs; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MCLBYTES; #ifdef RE_FIXUP_RX /* * This is part of an evil trick to deal with non-x86 platforms. * The RealTek chip requires RX buffers to be aligned on 64-bit * boundaries, but that will hose non-x86 machines. To get around * this, we leave some empty space at the start of each buffer * and for non-x86 hosts, we copy the buffer back six bytes * to achieve word alignment. This is slightly more efficient * than allocating a new buffer, copying the contents, and * discarding the old buffer. */ m_adj(m, RE_ETHER_ALIGN); #endif error = bus_dmamap_load_mbuf_sg(sc->rl_ldata.rl_rx_mtag, sc->rl_ldata.rl_rx_sparemap, m, segs, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { m_freem(m); return (ENOBUFS); } KASSERT(nsegs == 1, ("%s: %d segment returned!", __func__, nsegs)); rxd = &sc->rl_ldata.rl_rx_desc[idx]; if (rxd->rx_m != NULL) { bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, rxd->rx_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->rl_ldata.rl_rx_mtag, rxd->rx_dmamap); } rxd->rx_m = m; map = rxd->rx_dmamap; rxd->rx_dmamap = sc->rl_ldata.rl_rx_sparemap; rxd->rx_size = segs[0].ds_len; sc->rl_ldata.rl_rx_sparemap = map; bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, rxd->rx_dmamap, BUS_DMASYNC_PREREAD); desc = &sc->rl_ldata.rl_rx_list[idx]; desc->rl_vlanctl = 0; desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(segs[0].ds_addr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(segs[0].ds_addr)); cmdstat = segs[0].ds_len; if (idx == sc->rl_ldata.rl_rx_desc_cnt - 1) cmdstat |= RL_RDESC_CMD_EOR; desc->rl_cmdstat = htole32(cmdstat | RL_RDESC_CMD_OWN); return (0); } static int re_jumbo_newbuf(struct rl_softc *sc, int idx) { struct mbuf *m; struct rl_rxdesc *rxd; bus_dma_segment_t segs[1]; bus_dmamap_t map; struct rl_desc *desc; uint32_t cmdstat; int error, nsegs; m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUM9BYTES); if (m == NULL) return (ENOBUFS); m->m_len = m->m_pkthdr.len = MJUM9BYTES; #ifdef RE_FIXUP_RX m_adj(m, RE_ETHER_ALIGN); #endif error = bus_dmamap_load_mbuf_sg(sc->rl_ldata.rl_jrx_mtag, sc->rl_ldata.rl_jrx_sparemap, m, segs, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { m_freem(m); return (ENOBUFS); } KASSERT(nsegs == 1, ("%s: %d segment returned!", __func__, nsegs)); rxd = &sc->rl_ldata.rl_jrx_desc[idx]; if (rxd->rx_m != NULL) { bus_dmamap_sync(sc->rl_ldata.rl_jrx_mtag, rxd->rx_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->rl_ldata.rl_jrx_mtag, rxd->rx_dmamap); } rxd->rx_m = m; map = rxd->rx_dmamap; rxd->rx_dmamap = sc->rl_ldata.rl_jrx_sparemap; rxd->rx_size = segs[0].ds_len; sc->rl_ldata.rl_jrx_sparemap = map; bus_dmamap_sync(sc->rl_ldata.rl_jrx_mtag, rxd->rx_dmamap, BUS_DMASYNC_PREREAD); desc = &sc->rl_ldata.rl_rx_list[idx]; desc->rl_vlanctl = 0; desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(segs[0].ds_addr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(segs[0].ds_addr)); cmdstat = segs[0].ds_len; if (idx == sc->rl_ldata.rl_rx_desc_cnt - 1) cmdstat |= RL_RDESC_CMD_EOR; desc->rl_cmdstat = htole32(cmdstat | RL_RDESC_CMD_OWN); return (0); } #ifdef RE_FIXUP_RX static __inline void re_fixup_rx(struct mbuf *m) { int i; uint16_t *src, *dst; src = mtod(m, uint16_t *); dst = src - (RE_ETHER_ALIGN - ETHER_ALIGN) / sizeof *src; for (i = 0; i < (m->m_len / sizeof(uint16_t) + 1); i++) *dst++ = *src++; m->m_data -= RE_ETHER_ALIGN - ETHER_ALIGN; } #endif static int re_tx_list_init(struct rl_softc *sc) { struct rl_desc *desc; int i; RL_LOCK_ASSERT(sc); 
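	/*
	 * Reset the TX ring to a known state: clear all descriptors and any
	 * stashed mbuf pointers, set EOR on the last descriptor so the chip
	 * wraps back to entry 0, push the ring out to memory, and reset the
	 * producer/consumer indices and the free-descriptor count.
	 */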
bzero(sc->rl_ldata.rl_tx_list, sc->rl_ldata.rl_tx_desc_cnt * sizeof(struct rl_desc)); for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) sc->rl_ldata.rl_tx_desc[i].tx_m = NULL; #ifdef DEV_NETMAP re_netmap_tx_init(sc); #endif /* DEV_NETMAP */ /* Set EOR. */ desc = &sc->rl_ldata.rl_tx_list[sc->rl_ldata.rl_tx_desc_cnt - 1]; desc->rl_cmdstat |= htole32(RL_TDESC_CMD_EOR); bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); sc->rl_ldata.rl_tx_prodidx = 0; sc->rl_ldata.rl_tx_considx = 0; sc->rl_ldata.rl_tx_free = sc->rl_ldata.rl_tx_desc_cnt; return (0); } static int re_rx_list_init(struct rl_softc *sc) { int error, i; bzero(sc->rl_ldata.rl_rx_list, sc->rl_ldata.rl_rx_desc_cnt * sizeof(struct rl_desc)); for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { sc->rl_ldata.rl_rx_desc[i].rx_m = NULL; if ((error = re_newbuf(sc, i)) != 0) return (error); } #ifdef DEV_NETMAP re_netmap_rx_init(sc); #endif /* DEV_NETMAP */ /* Flush the RX descriptors */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); sc->rl_ldata.rl_rx_prodidx = 0; sc->rl_head = sc->rl_tail = NULL; sc->rl_int_rx_act = 0; return (0); } static int re_jrx_list_init(struct rl_softc *sc) { int error, i; bzero(sc->rl_ldata.rl_rx_list, sc->rl_ldata.rl_rx_desc_cnt * sizeof(struct rl_desc)); for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { sc->rl_ldata.rl_jrx_desc[i].rx_m = NULL; if ((error = re_jumbo_newbuf(sc, i)) != 0) return (error); } bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD); sc->rl_ldata.rl_rx_prodidx = 0; sc->rl_head = sc->rl_tail = NULL; sc->rl_int_rx_act = 0; return (0); } /* * RX handler for C+ and 8169. For the gigE chips, we support * the reception of jumbo frames that have been fragmented * across multiple 2K mbuf cluster buffers. */ static int re_rxeof(struct rl_softc *sc, int *rx_npktsp) { struct mbuf *m; struct ifnet *ifp; int i, rxerr, total_len; struct rl_desc *cur_rx; u_int32_t rxstat, rxvlan; int jumbo, maxpkt = 16, rx_npkts = 0; RL_LOCK_ASSERT(sc); ifp = sc->rl_ifp; #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, 0, &rx_npkts)) return 0; #endif /* DEV_NETMAP */ if (ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0) jumbo = 1; else jumbo = 0; /* Invalidate the descriptor memory */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); for (i = sc->rl_ldata.rl_rx_prodidx; maxpkt > 0; i = RL_RX_DESC_NXT(sc, i)) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; cur_rx = &sc->rl_ldata.rl_rx_list[i]; rxstat = le32toh(cur_rx->rl_cmdstat); if ((rxstat & RL_RDESC_STAT_OWN) != 0) break; total_len = rxstat & sc->rl_rxlenmask; rxvlan = le32toh(cur_rx->rl_vlanctl); if (jumbo != 0) m = sc->rl_ldata.rl_jrx_desc[i].rx_m; else m = sc->rl_ldata.rl_rx_desc[i].rx_m; if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0 && (rxstat & (RL_RDESC_STAT_SOF | RL_RDESC_STAT_EOF)) != (RL_RDESC_STAT_SOF | RL_RDESC_STAT_EOF)) { /* * RTL8168C or later controllers do not * support multi-fragment packet. */ re_discard_rxbuf(sc, i); continue; } else if ((rxstat & RL_RDESC_STAT_EOF) == 0) { if (re_newbuf(sc, i) != 0) { /* * If this is part of a multi-fragment packet, * discard all the pieces. 
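 * (The partially reassembled chain hanging off rl_head is freed and
 * the current descriptor is handed back to the chip unchanged via
 * re_discard_rxbuf().)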
*/ if (sc->rl_head != NULL) { m_freem(sc->rl_head); sc->rl_head = sc->rl_tail = NULL; } re_discard_rxbuf(sc, i); continue; } m->m_len = RE_RX_DESC_BUFLEN; if (sc->rl_head == NULL) sc->rl_head = sc->rl_tail = m; else { m->m_flags &= ~M_PKTHDR; sc->rl_tail->m_next = m; sc->rl_tail = m; } continue; } /* * NOTE: for the 8139C+, the frame length field * is always 12 bits in size, but for the gigE chips, * it is 13 bits (since the max RX frame length is 16K). * Unfortunately, all 32 bits in the status word * were already used, so to make room for the extra * length bit, RealTek took out the 'frame alignment * error' bit and shifted the other status bits * over one slot. The OWN, EOR, FS and LS bits are * still in the same places. We have already extracted * the frame length and checked the OWN bit, so rather * than using an alternate bit mapping, we shift the * status bits one space to the right so we can evaluate * them using the 8169 status as though it was in the * same format as that of the 8139C+. */ if (sc->rl_type == RL_8169) rxstat >>= 1; /* * if total_len > 2^13-1, both _RXERRSUM and _GIANT will be * set, but if CRC is clear, it will still be a valid frame. */ if ((rxstat & RL_RDESC_STAT_RXERRSUM) != 0) { rxerr = 1; if ((sc->rl_flags & RL_FLAG_JUMBOV2) == 0 && total_len > 8191 && (rxstat & RL_RDESC_STAT_ERRS) == RL_RDESC_STAT_GIANT) rxerr = 0; if (rxerr != 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); /* * If this is part of a multi-fragment packet, * discard all the pieces. */ if (sc->rl_head != NULL) { m_freem(sc->rl_head); sc->rl_head = sc->rl_tail = NULL; } re_discard_rxbuf(sc, i); continue; } } /* * If allocating a replacement mbuf fails, * reload the current one. */ if (jumbo != 0) rxerr = re_jumbo_newbuf(sc, i); else rxerr = re_newbuf(sc, i); if (rxerr != 0) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (sc->rl_head != NULL) { m_freem(sc->rl_head); sc->rl_head = sc->rl_tail = NULL; } re_discard_rxbuf(sc, i); continue; } if (sc->rl_head != NULL) { if (jumbo != 0) m->m_len = total_len; else { m->m_len = total_len % RE_RX_DESC_BUFLEN; if (m->m_len == 0) m->m_len = RE_RX_DESC_BUFLEN; } /* * Special case: if there's 4 bytes or less * in this buffer, the mbuf can be discarded: * the last 4 bytes is the CRC, which we don't * care about anyway. 
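 * For example, if only 2 bytes of the frame landed in this buffer,
 * both of them belong to the CRC: the mbuf is freed and the remaining
 * 2 CRC bytes are trimmed from the previous mbuf in the chain instead.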
*/ if (m->m_len <= ETHER_CRC_LEN) { sc->rl_tail->m_len -= (ETHER_CRC_LEN - m->m_len); m_freem(m); } else { m->m_len -= ETHER_CRC_LEN; m->m_flags &= ~M_PKTHDR; sc->rl_tail->m_next = m; } m = sc->rl_head; sc->rl_head = sc->rl_tail = NULL; m->m_pkthdr.len = total_len - ETHER_CRC_LEN; } else m->m_pkthdr.len = m->m_len = (total_len - ETHER_CRC_LEN); #ifdef RE_FIXUP_RX re_fixup_rx(m); #endif if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); m->m_pkthdr.rcvif = ifp; /* Do RX checksumming if enabled */ if (ifp->if_capenable & IFCAP_RXCSUM) { if ((sc->rl_flags & RL_FLAG_DESCV2) == 0) { /* Check IP header checksum */ if (rxstat & RL_RDESC_STAT_PROTOID) m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; if (!(rxstat & RL_RDESC_STAT_IPSUMBAD)) m->m_pkthdr.csum_flags |= CSUM_IP_VALID; /* Check TCP/UDP checksum */ if ((RL_TCPPKT(rxstat) && !(rxstat & RL_RDESC_STAT_TCPSUMBAD)) || (RL_UDPPKT(rxstat) && !(rxstat & RL_RDESC_STAT_UDPSUMBAD))) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID|CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } } else { /* * RTL8168C/RTL816CP/RTL8111C/RTL8111CP */ if ((rxstat & RL_RDESC_STAT_PROTOID) && (rxvlan & RL_RDESC_IPV4)) m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; if (!(rxstat & RL_RDESC_STAT_IPSUMBAD) && (rxvlan & RL_RDESC_IPV4)) m->m_pkthdr.csum_flags |= CSUM_IP_VALID; if (((rxstat & RL_RDESC_STAT_TCP) && !(rxstat & RL_RDESC_STAT_TCPSUMBAD)) || ((rxstat & RL_RDESC_STAT_UDP) && !(rxstat & RL_RDESC_STAT_UDPSUMBAD))) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID|CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } } } maxpkt--; if (rxvlan & RL_RDESC_VLANCTL_TAG) { m->m_pkthdr.ether_vtag = bswap16((rxvlan & RL_RDESC_VLANCTL_DATA)); m->m_flags |= M_VLANTAG; } RL_UNLOCK(sc); (*ifp->if_input)(ifp, m); RL_LOCK(sc); rx_npkts++; } /* Flush the RX DMA ring */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); sc->rl_ldata.rl_rx_prodidx = i; if (rx_npktsp != NULL) *rx_npktsp = rx_npkts; if (maxpkt) return (EAGAIN); return (0); } static void re_txeof(struct rl_softc *sc) { struct ifnet *ifp; struct rl_txdesc *txd; u_int32_t txstat; int cons; cons = sc->rl_ldata.rl_tx_considx; if (cons == sc->rl_ldata.rl_tx_prodidx) return; ifp = sc->rl_ifp; #ifdef DEV_NETMAP if (netmap_tx_irq(ifp, 0)) return; #endif /* DEV_NETMAP */ /* Invalidate the TX descriptor list */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); for (; cons != sc->rl_ldata.rl_tx_prodidx; cons = RL_TX_DESC_NXT(sc, cons)) { txstat = le32toh(sc->rl_ldata.rl_tx_list[cons].rl_cmdstat); if (txstat & RL_TDESC_STAT_OWN) break; /* * We only stash mbufs in the last descriptor * in a fragment chain, which also happens to * be the only place where the TX status bits * are valid. 
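 * So the DMA map is unloaded, the mbuf freed and the collision/error/
 * packet counters updated only when the descriptor carrying the EOF
 * bit is reaped; intermediate descriptors just bump rl_tx_free.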
*/ if (txstat & RL_TDESC_CMD_EOF) { txd = &sc->rl_ldata.rl_tx_desc[cons]; bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap); KASSERT(txd->tx_m != NULL, ("%s: freeing NULL mbufs!", __func__)); m_freem(txd->tx_m); txd->tx_m = NULL; if (txstat & (RL_TDESC_STAT_EXCESSCOL| RL_TDESC_STAT_COLCNT)) if_inc_counter(ifp, IFCOUNTER_COLLISIONS, 1); if (txstat & RL_TDESC_STAT_TXERRSUM) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); else if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); } sc->rl_ldata.rl_tx_free++; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } sc->rl_ldata.rl_tx_considx = cons; /* No changes made to the TX ring, so no flush needed */ if (sc->rl_ldata.rl_tx_free != sc->rl_ldata.rl_tx_desc_cnt) { #ifdef RE_TX_MODERATION /* * If not all descriptors have been reaped yet, reload * the timer so that we will eventually get another * interrupt that will cause us to re-enter this routine. * This is done in case the transmitter has gone idle. */ CSR_WRITE_4(sc, RL_TIMERCNT, 1); #endif } else sc->rl_watchdog_timer = 0; } static void re_tick(void *xsc) { struct rl_softc *sc; struct mii_data *mii; sc = xsc; RL_LOCK_ASSERT(sc); mii = device_get_softc(sc->rl_miibus); mii_tick(mii); if ((sc->rl_flags & RL_FLAG_LINK) == 0) re_miibus_statchg(sc->rl_dev); /* * Reclaim transmitted frames here. Technically it is not * necessary to do here but it ensures periodic reclamation * regardless of Tx completion interrupt which seems to be * lost on PCIe based controllers under certain situations. */ re_txeof(sc); re_watchdog(sc); callout_reset(&sc->rl_stat_callout, hz, re_tick, sc); } #ifdef DEVICE_POLLING static int re_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct rl_softc *sc = ifp->if_softc; int rx_npkts = 0; RL_LOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rx_npkts = re_poll_locked(ifp, cmd, count); RL_UNLOCK(sc); return (rx_npkts); } static int re_poll_locked(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct rl_softc *sc = ifp->if_softc; int rx_npkts; RL_LOCK_ASSERT(sc); sc->rxcycles = count; re_rxeof(sc, &rx_npkts); re_txeof(sc); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) re_start_locked(ifp); if (cmd == POLL_AND_CHECK_STATUS) { /* also check status register */ u_int16_t status; status = CSR_READ_2(sc, RL_ISR); if (status == 0xffff) return (rx_npkts); if (status) CSR_WRITE_2(sc, RL_ISR, status); if ((status & (RL_ISR_TX_OK | RL_ISR_TX_DESC_UNAVAIL)) && (sc->rl_flags & RL_FLAG_PCIE)) CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); /* * XXX check behaviour on receiver stalls. 
*/ if (status & RL_ISR_SYSTEM_ERR) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; re_init_locked(sc); } } return (rx_npkts); } #endif /* DEVICE_POLLING */ static int re_intr(void *arg) { struct rl_softc *sc; uint16_t status; sc = arg; status = CSR_READ_2(sc, RL_ISR); if (status == 0xFFFF || (status & RL_INTRS_CPLUS) == 0) return (FILTER_STRAY); CSR_WRITE_2(sc, RL_IMR, 0); taskqueue_enqueue(taskqueue_fast, &sc->rl_inttask); return (FILTER_HANDLED); } static void re_int_task(void *arg, int npending) { struct rl_softc *sc; struct ifnet *ifp; u_int16_t status; int rval = 0; sc = arg; ifp = sc->rl_ifp; RL_LOCK(sc); status = CSR_READ_2(sc, RL_ISR); CSR_WRITE_2(sc, RL_ISR, status); if (sc->suspended || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { RL_UNLOCK(sc); return; } #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) { RL_UNLOCK(sc); return; } #endif if (status & (RL_ISR_RX_OK|RL_ISR_RX_ERR|RL_ISR_FIFO_OFLOW)) rval = re_rxeof(sc, NULL); /* * Some chips will ignore a second TX request issued * while an existing transmission is in progress. If * the transmitter goes idle but there are still * packets waiting to be sent, we need to restart the * channel here to flush them out. This only seems to * be required with the PCIe devices. */ if ((status & (RL_ISR_TX_OK | RL_ISR_TX_DESC_UNAVAIL)) && (sc->rl_flags & RL_FLAG_PCIE)) CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); if (status & ( #ifdef RE_TX_MODERATION RL_ISR_TIMEOUT_EXPIRED| #else RL_ISR_TX_OK| #endif RL_ISR_TX_ERR|RL_ISR_TX_DESC_UNAVAIL)) re_txeof(sc); if (status & RL_ISR_SYSTEM_ERR) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; re_init_locked(sc); } if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) re_start_locked(ifp); RL_UNLOCK(sc); if ((CSR_READ_2(sc, RL_ISR) & RL_INTRS_CPLUS) || rval) { taskqueue_enqueue(taskqueue_fast, &sc->rl_inttask); return; } CSR_WRITE_2(sc, RL_IMR, RL_INTRS_CPLUS); } static void re_intr_msi(void *xsc) { struct rl_softc *sc; struct ifnet *ifp; uint16_t intrs, status; sc = xsc; RL_LOCK(sc); ifp = sc->rl_ifp; #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) { RL_UNLOCK(sc); return; } #endif /* Disable interrupts. */ CSR_WRITE_2(sc, RL_IMR, 0); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { RL_UNLOCK(sc); return; } intrs = RL_INTRS_CPLUS; status = CSR_READ_2(sc, RL_ISR); CSR_WRITE_2(sc, RL_ISR, status); if (sc->rl_int_rx_act > 0) { intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN); status &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN); } if (status & (RL_ISR_TIMEOUT_EXPIRED | RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) { re_rxeof(sc, NULL); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) { if (sc->rl_int_rx_mod != 0 && (status & (RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) != 0) { /* Rearm one-shot timer. */ CSR_WRITE_4(sc, RL_TIMERCNT, 1); intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN); sc->rl_int_rx_act = 1; } else { intrs |= RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN; sc->rl_int_rx_act = 0; } } } /* * Some chips will ignore a second TX request issued * while an existing transmission is in progress. If * the transmitter goes idle but there are still * packets waiting to be sent, we need to restart the * channel here to flush them out. This only seems to * be required with the PCIe devices. 
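 * The write below kicks the transmitter (RL_TXSTART_START) whenever a
 * TX OK or TX descriptor unavailable interrupt is seen on a PCIe
 * device, in case packets are still queued in the ring.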
*/ if ((status & (RL_ISR_TX_OK | RL_ISR_TX_DESC_UNAVAIL)) && (sc->rl_flags & RL_FLAG_PCIE)) CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); if (status & (RL_ISR_TX_OK | RL_ISR_TX_ERR | RL_ISR_TX_DESC_UNAVAIL)) re_txeof(sc); if (status & RL_ISR_SYSTEM_ERR) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; re_init_locked(sc); } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) { if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) re_start_locked(ifp); CSR_WRITE_2(sc, RL_IMR, intrs); } RL_UNLOCK(sc); } static int re_encap(struct rl_softc *sc, struct mbuf **m_head) { struct rl_txdesc *txd, *txd_last; bus_dma_segment_t segs[RL_NTXSEGS]; bus_dmamap_t map; struct mbuf *m_new; struct rl_desc *desc; int nsegs, prod; int i, error, ei, si; int padlen; uint32_t cmdstat, csum_flags, vlanctl; RL_LOCK_ASSERT(sc); M_ASSERTPKTHDR((*m_head)); /* * With some of the RealTek chips, using the checksum offload * support in conjunction with the autopadding feature results * in the transmission of corrupt frames. For example, if we * need to send a really small IP fragment that's less than 60 * bytes in size, and IP header checksumming is enabled, the * resulting ethernet frame that appears on the wire will * have garbled payload. To work around this, if TX IP checksum * offload is enabled, we always manually pad short frames out * to the minimum ethernet frame size. */ if ((sc->rl_flags & RL_FLAG_AUTOPAD) == 0 && (*m_head)->m_pkthdr.len < RL_IP4CSUMTX_PADLEN && ((*m_head)->m_pkthdr.csum_flags & CSUM_IP) != 0) { padlen = RL_MIN_FRAMELEN - (*m_head)->m_pkthdr.len; if (M_WRITABLE(*m_head) == 0) { /* Get a writable copy. */ m_new = m_dup(*m_head, M_NOWAIT); m_freem(*m_head); if (m_new == NULL) { *m_head = NULL; return (ENOBUFS); } *m_head = m_new; } if ((*m_head)->m_next != NULL || M_TRAILINGSPACE(*m_head) < padlen) { m_new = m_defrag(*m_head, M_NOWAIT); if (m_new == NULL) { m_freem(*m_head); *m_head = NULL; return (ENOBUFS); } } else m_new = *m_head; /* * Manually pad short frames, and zero the pad space * to avoid leaking data. */ bzero(mtod(m_new, char *) + m_new->m_pkthdr.len, padlen); m_new->m_pkthdr.len += padlen; m_new->m_len = m_new->m_pkthdr.len; *m_head = m_new; } prod = sc->rl_ldata.rl_tx_prodidx; txd = &sc->rl_ldata.rl_tx_desc[prod]; error = bus_dmamap_load_mbuf_sg(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap, *m_head, segs, &nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { m_new = m_collapse(*m_head, M_NOWAIT, RL_NTXSEGS); if (m_new == NULL) { m_freem(*m_head); *m_head = NULL; return (ENOBUFS); } *m_head = m_new; error = bus_dmamap_load_mbuf_sg(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap, *m_head, segs, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { m_freem(*m_head); *m_head = NULL; return (error); } } else if (error != 0) return (error); if (nsegs == 0) { m_freem(*m_head); *m_head = NULL; return (EIO); } /* Check for number of available descriptors. */ if (sc->rl_ldata.rl_tx_free - nsegs <= 1) { bus_dmamap_unload(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap); return (ENOBUFS); } bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap, BUS_DMASYNC_PREWRITE); /* * Set up checksum offload. Note: checksum offload bits must * appear in all descriptors of a multi-descriptor transmit * attempt. This is according to testing done with an 8169 * chip. This is a requirement. 
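 * The descriptor fill loop below therefore writes the same csum_flags
 * and vlanctl values into every descriptor of the chain, not only the
 * one carrying SOF.  As a rough sketch, a two-segment TSO frame ends
 * up with:
 *
 *	desc[si]   : OWN | SOF | LGSEND | len0
 *	desc[si+1] : OWN | EOF | LGSEND | len1
 *
 * (flag names abbreviated from the RL_TDESC_CMD_* constants).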
*/ vlanctl = 0; csum_flags = 0; if (((*m_head)->m_pkthdr.csum_flags & CSUM_TSO) != 0) { if ((sc->rl_flags & RL_FLAG_DESCV2) != 0) { csum_flags |= RL_TDESC_CMD_LGSEND; vlanctl |= ((uint32_t)(*m_head)->m_pkthdr.tso_segsz << RL_TDESC_CMD_MSSVALV2_SHIFT); } else { csum_flags |= RL_TDESC_CMD_LGSEND | ((uint32_t)(*m_head)->m_pkthdr.tso_segsz << RL_TDESC_CMD_MSSVAL_SHIFT); } } else { /* * Unconditionally enable IP checksum if TCP or UDP * checksum is required. Otherwise, the TCP/UDP checksum * request has no effect. */ if (((*m_head)->m_pkthdr.csum_flags & RE_CSUM_FEATURES) != 0) { if ((sc->rl_flags & RL_FLAG_DESCV2) == 0) { csum_flags |= RL_TDESC_CMD_IPCSUM; if (((*m_head)->m_pkthdr.csum_flags & CSUM_TCP) != 0) csum_flags |= RL_TDESC_CMD_TCPCSUM; if (((*m_head)->m_pkthdr.csum_flags & CSUM_UDP) != 0) csum_flags |= RL_TDESC_CMD_UDPCSUM; } else { vlanctl |= RL_TDESC_CMD_IPCSUMV2; if (((*m_head)->m_pkthdr.csum_flags & CSUM_TCP) != 0) vlanctl |= RL_TDESC_CMD_TCPCSUMV2; if (((*m_head)->m_pkthdr.csum_flags & CSUM_UDP) != 0) vlanctl |= RL_TDESC_CMD_UDPCSUMV2; } } } /* * Set up hardware VLAN tagging. Note: vlan tag info must * appear in all descriptors of a multi-descriptor * transmission attempt. */ if ((*m_head)->m_flags & M_VLANTAG) vlanctl |= bswap16((*m_head)->m_pkthdr.ether_vtag) | RL_TDESC_VLANCTL_TAG; si = prod; for (i = 0; i < nsegs; i++, prod = RL_TX_DESC_NXT(sc, prod)) { desc = &sc->rl_ldata.rl_tx_list[prod]; desc->rl_vlanctl = htole32(vlanctl); desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(segs[i].ds_addr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(segs[i].ds_addr)); cmdstat = segs[i].ds_len; if (i != 0) cmdstat |= RL_TDESC_CMD_OWN; if (prod == sc->rl_ldata.rl_tx_desc_cnt - 1) cmdstat |= RL_TDESC_CMD_EOR; desc->rl_cmdstat = htole32(cmdstat | csum_flags); sc->rl_ldata.rl_tx_free--; } /* Update producer index. */ sc->rl_ldata.rl_tx_prodidx = prod; /* Set EOF on the last descriptor. */ ei = RL_TX_DESC_PRV(sc, prod); desc = &sc->rl_ldata.rl_tx_list[ei]; desc->rl_cmdstat |= htole32(RL_TDESC_CMD_EOF); desc = &sc->rl_ldata.rl_tx_list[si]; /* Set SOF and transfer ownership of packet to the chip. */ desc->rl_cmdstat |= htole32(RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF); /* * Ensure that the map for this transmission * is placed at the array index of the last descriptor * in this chain. (Swap last and first dmamaps.) */ txd_last = &sc->rl_ldata.rl_tx_desc[ei]; map = txd->tx_dmamap; txd->tx_dmamap = txd_last->tx_dmamap; txd_last->tx_dmamap = map; txd_last->tx_m = *m_head; return (0); } static void re_start(struct ifnet *ifp) { struct rl_softc *sc; sc = ifp->if_softc; RL_LOCK(sc); re_start_locked(ifp); RL_UNLOCK(sc); } /* * Main transmit routine for C+ and gigE NICs. */ static void re_start_locked(struct ifnet *ifp) { struct rl_softc *sc; struct mbuf *m_head; int queued; sc = ifp->if_softc; #ifdef DEV_NETMAP /* XXX is this necessary ?
*/ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_kring *kring = &NA(ifp)->tx_rings[0]; if (sc->rl_ldata.rl_tx_prodidx != kring->nr_hwcur) { /* kick the tx unit */ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); #ifdef RE_TX_MODERATION CSR_WRITE_4(sc, RL_TIMERCNT, 1); #endif sc->rl_watchdog_timer = 5; } return; } #endif /* DEV_NETMAP */ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING || (sc->rl_flags & RL_FLAG_LINK) == 0) return; for (queued = 0; !IFQ_DRV_IS_EMPTY(&ifp->if_snd) && sc->rl_ldata.rl_tx_free > 1;) { IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (re_encap(sc, &m_head) != 0) { if (m_head == NULL) break; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } /* * If there's a BPF listener, bounce a copy of this frame * to him. */ ETHER_BPF_MTAP(ifp, m_head); queued++; } if (queued == 0) { #ifdef RE_TX_MODERATION if (sc->rl_ldata.rl_tx_free != sc->rl_ldata.rl_tx_desc_cnt) CSR_WRITE_4(sc, RL_TIMERCNT, 1); #endif return; } /* Flush the TX descriptors */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map, BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); #ifdef RE_TX_MODERATION /* * Use the countdown timer for interrupt moderation. * 'TX done' interrupts are disabled. Instead, we reset the * countdown timer, which will begin counting until it hits * the value in the TIMERINT register, and then trigger an * interrupt. Each time we write to the TIMERCNT register, * the timer count is reset to 0. */ CSR_WRITE_4(sc, RL_TIMERCNT, 1); #endif /* * Set a timeout in case the chip goes out to lunch. */ sc->rl_watchdog_timer = 5; } static void re_set_jumbo(struct rl_softc *sc, int jumbo) { if (sc->rl_hwrev->rl_rev == RL_HWREV_8168E_VL) { pci_set_max_read_req(sc->rl_dev, 4096); return; } CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_WRITECFG); if (jumbo != 0) { CSR_WRITE_1(sc, sc->rl_cfg3, CSR_READ_1(sc, sc->rl_cfg3) | RL_CFG3_JUMBO_EN0); switch (sc->rl_hwrev->rl_rev) { case RL_HWREV_8168DP: break; case RL_HWREV_8168E: CSR_WRITE_1(sc, sc->rl_cfg4, CSR_READ_1(sc, sc->rl_cfg4) | 0x01); break; default: CSR_WRITE_1(sc, sc->rl_cfg4, CSR_READ_1(sc, sc->rl_cfg4) | RL_CFG4_JUMBO_EN1); } } else { CSR_WRITE_1(sc, sc->rl_cfg3, CSR_READ_1(sc, sc->rl_cfg3) & ~RL_CFG3_JUMBO_EN0); switch (sc->rl_hwrev->rl_rev) { case RL_HWREV_8168DP: break; case RL_HWREV_8168E: CSR_WRITE_1(sc, sc->rl_cfg4, CSR_READ_1(sc, sc->rl_cfg4) & ~0x01); break; default: CSR_WRITE_1(sc, sc->rl_cfg4, CSR_READ_1(sc, sc->rl_cfg4) & ~RL_CFG4_JUMBO_EN1); } } CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_OFF); switch (sc->rl_hwrev->rl_rev) { case RL_HWREV_8168DP: pci_set_max_read_req(sc->rl_dev, 4096); break; default: if (jumbo != 0) pci_set_max_read_req(sc->rl_dev, 512); else pci_set_max_read_req(sc->rl_dev, 4096); } } static void re_init(void *xsc) { struct rl_softc *sc = xsc; RL_LOCK(sc); re_init_locked(sc); RL_UNLOCK(sc); } static void re_init_locked(struct rl_softc *sc) { struct ifnet *ifp = sc->rl_ifp; struct mii_data *mii; uint32_t reg; uint16_t cfg; union { uint32_t align_dummy; u_char eaddr[ETHER_ADDR_LEN]; } eaddr; RL_LOCK_ASSERT(sc); mii = device_get_softc(sc->rl_miibus); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) return; /* * Cancel pending I/O and free all RX/TX buffers. */ re_stop(sc); /* Put controller into known state. */ re_reset(sc); /* * For C+ mode, initialize the RX descriptors and mbufs. 
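 * Controllers using the newer jumbo scheme (RL_FLAG_JUMBOV2) switch to
 * a ring of 9KB clusters (re_jrx_list_init()) once the MTU exceeds
 * RL_MTU, and hardware checksum offload and TSO are disabled for that
 * configuration.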
*/ if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0) { if (ifp->if_mtu > RL_MTU) { if (re_jrx_list_init(sc) != 0) { device_printf(sc->rl_dev, "no memory for jumbo RX buffers\n"); re_stop(sc); return; } /* Disable checksum offloading for jumbo frames. */ ifp->if_capenable &= ~(IFCAP_HWCSUM | IFCAP_TSO4); ifp->if_hwassist &= ~(RE_CSUM_FEATURES | CSUM_TSO); } else { if (re_rx_list_init(sc) != 0) { device_printf(sc->rl_dev, "no memory for RX buffers\n"); re_stop(sc); return; } } re_set_jumbo(sc, ifp->if_mtu > RL_MTU); } else { if (re_rx_list_init(sc) != 0) { device_printf(sc->rl_dev, "no memory for RX buffers\n"); re_stop(sc); return; } if ((sc->rl_flags & RL_FLAG_PCIE) != 0 && pci_get_device(sc->rl_dev) != RT_DEVICEID_8101E) { if (ifp->if_mtu > RL_MTU) pci_set_max_read_req(sc->rl_dev, 512); else pci_set_max_read_req(sc->rl_dev, 4096); } } re_tx_list_init(sc); /* * Enable C+ RX and TX mode, as well as VLAN stripping and * RX checksum offload. We must configure the C+ register * before all others. */ cfg = RL_CPLUSCMD_PCI_MRW; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) cfg |= RL_CPLUSCMD_RXCSUM_ENB; if ((ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) cfg |= RL_CPLUSCMD_VLANSTRIP; if ((sc->rl_flags & RL_FLAG_MACSTAT) != 0) { cfg |= RL_CPLUSCMD_MACSTAT_DIS; /* XXX magic. */ cfg |= 0x0001; } else cfg |= RL_CPLUSCMD_RXENB | RL_CPLUSCMD_TXENB; CSR_WRITE_2(sc, RL_CPLUS_CMD, cfg); if (sc->rl_hwrev->rl_rev == RL_HWREV_8169_8110SC || sc->rl_hwrev->rl_rev == RL_HWREV_8169_8110SCE) { reg = 0x000fff00; if ((CSR_READ_1(sc, sc->rl_cfg2) & RL_CFG2_PCI66MHZ) != 0) reg |= 0x000000ff; if (sc->rl_hwrev->rl_rev == RL_HWREV_8169_8110SCE) reg |= 0x00f00000; CSR_WRITE_4(sc, 0x7c, reg); /* Disable interrupt mitigation. */ CSR_WRITE_2(sc, 0xe2, 0); } /* * Disable TSO if interface MTU size is greater than MSS * allowed in controller. */ if (ifp->if_mtu > RL_TSO_MTU && (ifp->if_capenable & IFCAP_TSO4) != 0) { ifp->if_capenable &= ~IFCAP_TSO4; ifp->if_hwassist &= ~CSUM_TSO; } /* * Init our MAC address. Even though the chipset * documentation doesn't mention it, we need to enter "Config * register write enable" mode to modify the ID registers. */ /* Copy MAC address on stack to align. */ bcopy(IF_LLADDR(ifp), eaddr.eaddr, ETHER_ADDR_LEN); CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_WRITECFG); CSR_WRITE_4(sc, RL_IDR0, htole32(*(u_int32_t *)(&eaddr.eaddr[0]))); CSR_WRITE_4(sc, RL_IDR4, htole32(*(u_int32_t *)(&eaddr.eaddr[4]))); CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_OFF); /* * Load the addresses of the RX and TX lists into the chip. */ CSR_WRITE_4(sc, RL_RXLIST_ADDR_HI, RL_ADDR_HI(sc->rl_ldata.rl_rx_list_addr)); CSR_WRITE_4(sc, RL_RXLIST_ADDR_LO, RL_ADDR_LO(sc->rl_ldata.rl_rx_list_addr)); CSR_WRITE_4(sc, RL_TXLIST_ADDR_HI, RL_ADDR_HI(sc->rl_ldata.rl_tx_list_addr)); CSR_WRITE_4(sc, RL_TXLIST_ADDR_LO, RL_ADDR_LO(sc->rl_ldata.rl_tx_list_addr)); if ((sc->rl_flags & RL_FLAG_8168G_PLUS) != 0) { /* Disable RXDV gate. */ CSR_WRITE_4(sc, RL_MISC, CSR_READ_4(sc, RL_MISC) & ~0x00080000); } /* * Enable transmit and receive for pre-RTL8168G controllers. * RX/TX MACs should be enabled before RX/TX configuration. */ if ((sc->rl_flags & RL_FLAG_8168G_PLUS) == 0) CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_TX_ENB | RL_CMD_RX_ENB); /* * Set the initial TX configuration. 
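 * In test mode the transmitter is additionally configured for loopback
 * (RL_LOOPTEST_ON, or RL_LOOPTEST_ON_CPLUS for non-8169 chips).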
*/ if (sc->rl_testmode) { if (sc->rl_type == RL_8169) CSR_WRITE_4(sc, RL_TXCFG, RL_TXCFG_CONFIG|RL_LOOPTEST_ON); else CSR_WRITE_4(sc, RL_TXCFG, RL_TXCFG_CONFIG|RL_LOOPTEST_ON_CPLUS); } else CSR_WRITE_4(sc, RL_TXCFG, RL_TXCFG_CONFIG); CSR_WRITE_1(sc, RL_EARLY_TX_THRESH, 16); /* * Set the initial RX configuration. */ re_set_rxmode(sc); /* Configure interrupt moderation. */ if (sc->rl_type == RL_8169) { /* Magic from vendor. */ CSR_WRITE_2(sc, RL_INTRMOD, 0x5100); } /* * Enable transmit and receive for RTL8168G and later controllers. * RX/TX MACs should be enabled after RX/TX configuration. */ if ((sc->rl_flags & RL_FLAG_8168G_PLUS) != 0) CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_TX_ENB | RL_CMD_RX_ENB); #ifdef DEVICE_POLLING /* * Disable interrupts if we are polling. */ if (ifp->if_capenable & IFCAP_POLLING) CSR_WRITE_2(sc, RL_IMR, 0); else /* otherwise ... */ #endif /* * Enable interrupts. */ if (sc->rl_testmode) CSR_WRITE_2(sc, RL_IMR, 0); else CSR_WRITE_2(sc, RL_IMR, RL_INTRS_CPLUS); CSR_WRITE_2(sc, RL_ISR, RL_INTRS_CPLUS); /* Set initial TX threshold */ sc->rl_txthresh = RL_TX_THRESH_INIT; /* Start RX/TX process. */ CSR_WRITE_4(sc, RL_MISSEDPKT, 0); /* * Initialize the timer interrupt register so that * a timer interrupt will be generated once the timer * reaches a certain number of ticks. The timer is * reloaded on each transmit. */ #ifdef RE_TX_MODERATION /* * Use timer interrupt register to moderate TX interrupt * moderation, which dramatically improves TX frame rate. */ if (sc->rl_type == RL_8169) CSR_WRITE_4(sc, RL_TIMERINT_8169, 0x800); else CSR_WRITE_4(sc, RL_TIMERINT, 0x400); #else /* * Use timer interrupt register to moderate RX interrupt * moderation. */ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 && intr_filter == 0) { if (sc->rl_type == RL_8169) CSR_WRITE_4(sc, RL_TIMERINT_8169, RL_USECS(sc->rl_int_rx_mod)); } else { if (sc->rl_type == RL_8169) CSR_WRITE_4(sc, RL_TIMERINT_8169, RL_USECS(0)); } #endif /* * For 8169 gigE NICs, set the max allowed RX packet * size so we can receive jumbo frames. */ if (sc->rl_type == RL_8169) { if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0) { /* * For controllers that use new jumbo frame scheme, * set maximum size of jumbo frame depending on * controller revisions. */ if (ifp->if_mtu > RL_MTU) CSR_WRITE_2(sc, RL_MAXRXPKTLEN, sc->rl_hwrev->rl_max_mtu + ETHER_VLAN_ENCAP_LEN + ETHER_HDR_LEN + ETHER_CRC_LEN); else CSR_WRITE_2(sc, RL_MAXRXPKTLEN, RE_RX_DESC_BUFLEN); } else if ((sc->rl_flags & RL_FLAG_PCIE) != 0 && sc->rl_hwrev->rl_max_mtu == RL_MTU) { /* RTL810x has no jumbo frame support. */ CSR_WRITE_2(sc, RL_MAXRXPKTLEN, RE_RX_DESC_BUFLEN); } else CSR_WRITE_2(sc, RL_MAXRXPKTLEN, 16383); } if (sc->rl_testmode) return; CSR_WRITE_1(sc, sc->rl_cfg1, CSR_READ_1(sc, sc->rl_cfg1) | RL_CFG1_DRVLOAD); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; sc->rl_flags &= ~RL_FLAG_LINK; mii_mediachg(mii); sc->rl_watchdog_timer = 0; callout_reset(&sc->rl_stat_callout, hz, re_tick, sc); } /* * Set media options. */ static int re_ifmedia_upd(struct ifnet *ifp) { struct rl_softc *sc; struct mii_data *mii; int error; sc = ifp->if_softc; mii = device_get_softc(sc->rl_miibus); RL_LOCK(sc); error = mii_mediachg(mii); RL_UNLOCK(sc); return (error); } /* * Report current media status. 
*/ static void re_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct rl_softc *sc; struct mii_data *mii; sc = ifp->if_softc; mii = device_get_softc(sc->rl_miibus); RL_LOCK(sc); mii_pollstat(mii); ifmr->ifm_active = mii->mii_media_active; ifmr->ifm_status = mii->mii_media_status; RL_UNLOCK(sc); } static int re_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct rl_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; struct mii_data *mii; int error = 0; switch (command) { case SIOCSIFMTU: if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > sc->rl_hwrev->rl_max_mtu || ((sc->rl_flags & RL_FLAG_FASTETHER) != 0 && ifr->ifr_mtu > RL_MTU)) { error = EINVAL; break; } RL_LOCK(sc); if (ifp->if_mtu != ifr->ifr_mtu) { ifp->if_mtu = ifr->ifr_mtu; if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0 && (ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; re_init_locked(sc); } if (ifp->if_mtu > RL_TSO_MTU && (ifp->if_capenable & IFCAP_TSO4) != 0) { ifp->if_capenable &= ~(IFCAP_TSO4 | IFCAP_VLAN_HWTSO); ifp->if_hwassist &= ~CSUM_TSO; } VLAN_CAPABILITIES(ifp); } RL_UNLOCK(sc); break; case SIOCSIFFLAGS: RL_LOCK(sc); if ((ifp->if_flags & IFF_UP) != 0) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) { if (((ifp->if_flags ^ sc->rl_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) != 0) re_set_rxmode(sc); } else re_init_locked(sc); } else { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) re_stop(sc); } sc->rl_if_flags = ifp->if_flags; RL_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: RL_LOCK(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) re_set_rxmode(sc); RL_UNLOCK(sc); break; case SIOCGIFMEDIA: case SIOCSIFMEDIA: mii = device_get_softc(sc->rl_miibus); error = ifmedia_ioctl(ifp, ifr, &mii->mii_media, command); break; case SIOCSIFCAP: { int mask, reinit; mask = ifr->ifr_reqcap ^ ifp->if_capenable; reinit = 0; #ifdef DEVICE_POLLING if (mask & IFCAP_POLLING) { if (ifr->ifr_reqcap & IFCAP_POLLING) { error = ether_poll_register(re_poll, ifp); if (error) return (error); RL_LOCK(sc); /* Disable interrupts */ CSR_WRITE_2(sc, RL_IMR, 0x0000); ifp->if_capenable |= IFCAP_POLLING; RL_UNLOCK(sc); } else { error = ether_poll_deregister(ifp); /* Enable interrupts. */ RL_LOCK(sc); CSR_WRITE_2(sc, RL_IMR, RL_INTRS_CPLUS); ifp->if_capenable &= ~IFCAP_POLLING; RL_UNLOCK(sc); } } #endif /* DEVICE_POLLING */ RL_LOCK(sc); if ((mask & IFCAP_TXCSUM) != 0 && (ifp->if_capabilities & IFCAP_TXCSUM) != 0) { ifp->if_capenable ^= IFCAP_TXCSUM; if ((ifp->if_capenable & IFCAP_TXCSUM) != 0) ifp->if_hwassist |= RE_CSUM_FEATURES; else ifp->if_hwassist &= ~RE_CSUM_FEATURES; reinit = 1; } if ((mask & IFCAP_RXCSUM) != 0 && (ifp->if_capabilities & IFCAP_RXCSUM) != 0) { ifp->if_capenable ^= IFCAP_RXCSUM; reinit = 1; } if ((mask & IFCAP_TSO4) != 0 && (ifp->if_capabilities & IFCAP_TSO4) != 0) { ifp->if_capenable ^= IFCAP_TSO4; if ((IFCAP_TSO4 & ifp->if_capenable) != 0) ifp->if_hwassist |= CSUM_TSO; else ifp->if_hwassist &= ~CSUM_TSO; if (ifp->if_mtu > RL_TSO_MTU && (ifp->if_capenable & IFCAP_TSO4) != 0) { ifp->if_capenable &= ~IFCAP_TSO4; ifp->if_hwassist &= ~CSUM_TSO; } } if ((mask & IFCAP_VLAN_HWTSO) != 0 && (ifp->if_capabilities & IFCAP_VLAN_HWTSO) != 0) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if ((mask & IFCAP_VLAN_HWTAGGING) != 0 && (ifp->if_capabilities & IFCAP_VLAN_HWTAGGING) != 0) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; /* TSO over VLAN requires VLAN hardware tagging. 
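 * So if hardware tagging ended up disabled by the toggle above,
 * IFCAP_VLAN_HWTSO is cleared below as well and the interface is
 * marked for reinitialization.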
*/ if ((ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) ifp->if_capenable &= ~IFCAP_VLAN_HWTSO; reinit = 1; } if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0 && (mask & (IFCAP_HWCSUM | IFCAP_TSO4 | IFCAP_VLAN_HWTSO)) != 0) reinit = 1; if ((mask & IFCAP_WOL) != 0 && (ifp->if_capabilities & IFCAP_WOL) != 0) { if ((mask & IFCAP_WOL_UCAST) != 0) ifp->if_capenable ^= IFCAP_WOL_UCAST; if ((mask & IFCAP_WOL_MCAST) != 0) ifp->if_capenable ^= IFCAP_WOL_MCAST; if ((mask & IFCAP_WOL_MAGIC) != 0) ifp->if_capenable ^= IFCAP_WOL_MAGIC; } if (reinit && ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; re_init_locked(sc); } RL_UNLOCK(sc); VLAN_CAPABILITIES(ifp); } break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static void re_watchdog(struct rl_softc *sc) { struct ifnet *ifp; RL_LOCK_ASSERT(sc); if (sc->rl_watchdog_timer == 0 || --sc->rl_watchdog_timer != 0) return; ifp = sc->rl_ifp; re_txeof(sc); if (sc->rl_ldata.rl_tx_free == sc->rl_ldata.rl_tx_desc_cnt) { if_printf(ifp, "watchdog timeout (missed Tx interrupts) " "-- recovering\n"); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) re_start_locked(ifp); return; } if_printf(ifp, "watchdog timeout\n"); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); re_rxeof(sc, NULL); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; re_init_locked(sc); if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) re_start_locked(ifp); } /* * Stop the adapter and free any mbufs allocated to the * RX and TX lists. */ static void re_stop(struct rl_softc *sc) { int i; struct ifnet *ifp; struct rl_txdesc *txd; struct rl_rxdesc *rxd; RL_LOCK_ASSERT(sc); ifp = sc->rl_ifp; sc->rl_watchdog_timer = 0; callout_stop(&sc->rl_stat_callout); ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); /* * Disable accepting frames to put RX MAC into idle state. * Otherwise it's possible to get frames while stop command * execution is in progress and controller can DMA the frame * to already freed RX buffer during that period. */ CSR_WRITE_4(sc, RL_RXCFG, CSR_READ_4(sc, RL_RXCFG) & ~(RL_RXCFG_RX_ALLPHYS | RL_RXCFG_RX_INDIV | RL_RXCFG_RX_MULTI | RL_RXCFG_RX_BROAD)); if ((sc->rl_flags & RL_FLAG_8168G_PLUS) != 0) { /* Enable RXDV gate. */ CSR_WRITE_4(sc, RL_MISC, CSR_READ_4(sc, RL_MISC) | 0x00080000); } if ((sc->rl_flags & RL_FLAG_WAIT_TXPOLL) != 0) { for (i = RL_TIMEOUT; i > 0; i--) { if ((CSR_READ_1(sc, sc->rl_txstart) & RL_TXSTART_START) == 0) break; DELAY(20); } if (i == 0) device_printf(sc->rl_dev, "stopping TX poll timed out!\n"); CSR_WRITE_1(sc, RL_COMMAND, 0x00); } else if ((sc->rl_flags & RL_FLAG_CMDSTOP) != 0) { CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_STOPREQ | RL_CMD_TX_ENB | RL_CMD_RX_ENB); if ((sc->rl_flags & RL_FLAG_CMDSTOP_WAIT_TXQ) != 0) { for (i = RL_TIMEOUT; i > 0; i--) { if ((CSR_READ_4(sc, RL_TXCFG) & RL_TXCFG_QUEUE_EMPTY) != 0) break; DELAY(100); } if (i == 0) device_printf(sc->rl_dev, "stopping TXQ timed out!\n"); } } else CSR_WRITE_1(sc, RL_COMMAND, 0x00); DELAY(1000); CSR_WRITE_2(sc, RL_IMR, 0x0000); CSR_WRITE_2(sc, RL_ISR, 0xFFFF); if (sc->rl_head != NULL) { m_freem(sc->rl_head); sc->rl_head = sc->rl_tail = NULL; } /* Free the TX list buffers. */ for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) { txd = &sc->rl_ldata.rl_tx_desc[i]; if (txd->tx_m != NULL) { bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->rl_ldata.rl_tx_mtag, txd->tx_dmamap); m_freem(txd->tx_m); txd->tx_m = NULL; } } /* Free the RX list buffers. 
*/ for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { rxd = &sc->rl_ldata.rl_rx_desc[i]; if (rxd->rx_m != NULL) { bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, rxd->rx_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->rl_ldata.rl_rx_mtag, rxd->rx_dmamap); m_freem(rxd->rx_m); rxd->rx_m = NULL; } } if ((sc->rl_flags & RL_FLAG_JUMBOV2) != 0) { for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { rxd = &sc->rl_ldata.rl_jrx_desc[i]; if (rxd->rx_m != NULL) { bus_dmamap_sync(sc->rl_ldata.rl_jrx_mtag, rxd->rx_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->rl_ldata.rl_jrx_mtag, rxd->rx_dmamap); m_freem(rxd->rx_m); rxd->rx_m = NULL; } } } } /* * Device suspend routine. Stop the interface and save some PCI * settings in case the BIOS doesn't restore them properly on * resume. */ static int re_suspend(device_t dev) { struct rl_softc *sc; sc = device_get_softc(dev); RL_LOCK(sc); re_stop(sc); re_setwol(sc); sc->suspended = 1; RL_UNLOCK(sc); return (0); } /* * Device resume routine. Restore some PCI settings in case the BIOS * doesn't, re-enable busmastering, and restart the interface if * appropriate. */ static int re_resume(device_t dev) { struct rl_softc *sc; struct ifnet *ifp; sc = device_get_softc(dev); RL_LOCK(sc); ifp = sc->rl_ifp; /* Take controller out of sleep mode. */ if ((sc->rl_flags & RL_FLAG_MACSLEEP) != 0) { if ((CSR_READ_1(sc, RL_MACDBG) & 0x80) == 0x80) CSR_WRITE_1(sc, RL_GPIO, CSR_READ_1(sc, RL_GPIO) | 0x01); } /* * Clear WOL matching such that normal Rx filtering * wouldn't interfere with WOL patterns. */ re_clrwol(sc); /* reinitialize interface if necessary */ if (ifp->if_flags & IFF_UP) re_init_locked(sc); sc->suspended = 0; RL_UNLOCK(sc); return (0); } /* * Stop all chip I/O so that the kernel's probe routines don't * get confused by errant DMAs when rebooting. */ static int re_shutdown(device_t dev) { struct rl_softc *sc; sc = device_get_softc(dev); RL_LOCK(sc); re_stop(sc); /* * Mark interface as down since otherwise we will panic if * interrupt comes in later on, which can happen in some * cases. */ sc->rl_ifp->if_flags &= ~IFF_UP; re_setwol(sc); RL_UNLOCK(sc); return (0); } static void re_set_linkspeed(struct rl_softc *sc) { struct mii_softc *miisc; struct mii_data *mii; int aneg, i, phyno; RL_LOCK_ASSERT(sc); mii = device_get_softc(sc->rl_miibus); mii_pollstat(mii); aneg = 0; if ((mii->mii_media_status & (IFM_ACTIVE | IFM_AVALID)) == (IFM_ACTIVE | IFM_AVALID)) { switch IFM_SUBTYPE(mii->mii_media_active) { case IFM_10_T: case IFM_100_TX: return; case IFM_1000_T: aneg++; break; default: break; } } miisc = LIST_FIRST(&mii->mii_phys); phyno = miisc->mii_phy; LIST_FOREACH(miisc, &mii->mii_phys, mii_list) PHY_RESET(miisc); re_miibus_writereg(sc->rl_dev, phyno, MII_100T2CR, 0); re_miibus_writereg(sc->rl_dev, phyno, MII_ANAR, ANAR_TX_FD | ANAR_TX | ANAR_10_FD | ANAR_10 | ANAR_CSMA); re_miibus_writereg(sc->rl_dev, phyno, MII_BMCR, BMCR_AUTOEN | BMCR_STARTNEG); DELAY(1000); if (aneg != 0) { /* * Poll link state until re(4) get a 10/100Mbps link. */ for (i = 0; i < MII_ANEGTICKS_GIGE; i++) { mii_pollstat(mii); if ((mii->mii_media_status & (IFM_ACTIVE | IFM_AVALID)) == (IFM_ACTIVE | IFM_AVALID)) { switch (IFM_SUBTYPE(mii->mii_media_active)) { case IFM_10_T: case IFM_100_TX: return; default: break; } } RL_UNLOCK(sc); pause("relnk", hz); RL_LOCK(sc); } if (i == MII_ANEGTICKS_GIGE) device_printf(sc->rl_dev, "establishing a link failed, WOL may not work!"); } /* * No link, force MAC to have 100Mbps, full-duplex link. 
* MAC does not require reprogramming on resolved speed/duplex, * so this is just for completeness. */ mii->mii_media_status = IFM_AVALID | IFM_ACTIVE; mii->mii_media_active = IFM_ETHER | IFM_100_TX | IFM_FDX; } static void re_setwol(struct rl_softc *sc) { struct ifnet *ifp; int pmc; uint16_t pmstat; uint8_t v; RL_LOCK_ASSERT(sc); if (pci_find_cap(sc->rl_dev, PCIY_PMG, &pmc) != 0) return; ifp = sc->rl_ifp; /* Put controller into sleep mode. */ if ((sc->rl_flags & RL_FLAG_MACSLEEP) != 0) { if ((CSR_READ_1(sc, RL_MACDBG) & 0x80) == 0x80) CSR_WRITE_1(sc, RL_GPIO, CSR_READ_1(sc, RL_GPIO) & ~0x01); } if ((ifp->if_capenable & IFCAP_WOL) != 0) { if ((sc->rl_flags & RL_FLAG_8168G_PLUS) != 0) { /* Disable RXDV gate. */ CSR_WRITE_4(sc, RL_MISC, CSR_READ_4(sc, RL_MISC) & ~0x00080000); } re_set_rxmode(sc); if ((sc->rl_flags & RL_FLAG_WOL_MANLINK) != 0) re_set_linkspeed(sc); if ((sc->rl_flags & RL_FLAG_WOLRXENB) != 0) CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_RX_ENB); } /* Enable config register write. */ CSR_WRITE_1(sc, RL_EECMD, RL_EE_MODE); /* Enable PME. */ v = CSR_READ_1(sc, sc->rl_cfg1); v &= ~RL_CFG1_PME; if ((ifp->if_capenable & IFCAP_WOL) != 0) v |= RL_CFG1_PME; CSR_WRITE_1(sc, sc->rl_cfg1, v); v = CSR_READ_1(sc, sc->rl_cfg3); v &= ~(RL_CFG3_WOL_LINK | RL_CFG3_WOL_MAGIC); if ((ifp->if_capenable & IFCAP_WOL_MAGIC) != 0) v |= RL_CFG3_WOL_MAGIC; CSR_WRITE_1(sc, sc->rl_cfg3, v); v = CSR_READ_1(sc, sc->rl_cfg5); v &= ~(RL_CFG5_WOL_BCAST | RL_CFG5_WOL_MCAST | RL_CFG5_WOL_UCAST | RL_CFG5_WOL_LANWAKE); if ((ifp->if_capenable & IFCAP_WOL_UCAST) != 0) v |= RL_CFG5_WOL_UCAST; if ((ifp->if_capenable & IFCAP_WOL_MCAST) != 0) v |= RL_CFG5_WOL_MCAST | RL_CFG5_WOL_BCAST; if ((ifp->if_capenable & IFCAP_WOL) != 0) v |= RL_CFG5_WOL_LANWAKE; CSR_WRITE_1(sc, sc->rl_cfg5, v); /* Config register write done. */ CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_OFF); if ((ifp->if_capenable & IFCAP_WOL) == 0 && (sc->rl_flags & RL_FLAG_PHYWAKE_PM) != 0) CSR_WRITE_1(sc, RL_PMCH, CSR_READ_1(sc, RL_PMCH) & ~0x80); /* * It seems that hardware resets its link speed to 100Mbps in * power down mode so switching to 100Mbps in driver is not * needed. */ /* Request PME if WOL is requested. */ pmstat = pci_read_config(sc->rl_dev, pmc + PCIR_POWER_STATUS, 2); pmstat &= ~(PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE); if ((ifp->if_capenable & IFCAP_WOL) != 0) pmstat |= PCIM_PSTAT_PME | PCIM_PSTAT_PMEENABLE; pci_write_config(sc->rl_dev, pmc + PCIR_POWER_STATUS, pmstat, 2); } static void re_clrwol(struct rl_softc *sc) { int pmc; uint8_t v; RL_LOCK_ASSERT(sc); if (pci_find_cap(sc->rl_dev, PCIY_PMG, &pmc) != 0) return; /* Enable config register write. */ CSR_WRITE_1(sc, RL_EECMD, RL_EE_MODE); v = CSR_READ_1(sc, sc->rl_cfg3); v &= ~(RL_CFG3_WOL_LINK | RL_CFG3_WOL_MAGIC); CSR_WRITE_1(sc, sc->rl_cfg3, v); /* Config register write done. 
*/ CSR_WRITE_1(sc, RL_EECMD, RL_EEMODE_OFF); v = CSR_READ_1(sc, sc->rl_cfg5); v &= ~(RL_CFG5_WOL_BCAST | RL_CFG5_WOL_MCAST | RL_CFG5_WOL_UCAST); v &= ~RL_CFG5_WOL_LANWAKE; CSR_WRITE_1(sc, sc->rl_cfg5, v); } static void re_add_sysctls(struct rl_softc *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; int error; ctx = device_get_sysctl_ctx(sc->rl_dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->rl_dev)); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "stats", CTLTYPE_INT | CTLFLAG_RW, sc, 0, re_sysctl_stats, "I", "Statistics Information"); if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0) return; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "int_rx_mod", CTLTYPE_INT | CTLFLAG_RW, &sc->rl_int_rx_mod, 0, sysctl_hw_re_int_mod, "I", "re RX interrupt moderation"); /* Pull in device tunables. */ sc->rl_int_rx_mod = RL_TIMER_DEFAULT; error = resource_int_value(device_get_name(sc->rl_dev), device_get_unit(sc->rl_dev), "int_rx_mod", &sc->rl_int_rx_mod); if (error == 0) { if (sc->rl_int_rx_mod < RL_TIMER_MIN || sc->rl_int_rx_mod > RL_TIMER_MAX) { device_printf(sc->rl_dev, "int_rx_mod value out of " "range; using default: %d\n", RL_TIMER_DEFAULT); sc->rl_int_rx_mod = RL_TIMER_DEFAULT; } } } static int re_sysctl_stats(SYSCTL_HANDLER_ARGS) { struct rl_softc *sc; struct rl_stats *stats; int error, i, result; result = -1; error = sysctl_handle_int(oidp, &result, 0, req); if (error || req->newptr == NULL) return (error); if (result == 1) { sc = (struct rl_softc *)arg1; RL_LOCK(sc); if ((sc->rl_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { RL_UNLOCK(sc); goto done; } bus_dmamap_sync(sc->rl_ldata.rl_stag, sc->rl_ldata.rl_smap, BUS_DMASYNC_PREREAD); CSR_WRITE_4(sc, RL_DUMPSTATS_HI, RL_ADDR_HI(sc->rl_ldata.rl_stats_addr)); CSR_WRITE_4(sc, RL_DUMPSTATS_LO, RL_ADDR_LO(sc->rl_ldata.rl_stats_addr)); CSR_WRITE_4(sc, RL_DUMPSTATS_LO, RL_ADDR_LO(sc->rl_ldata.rl_stats_addr | RL_DUMPSTATS_START)); for (i = RL_TIMEOUT; i > 0; i--) { if ((CSR_READ_4(sc, RL_DUMPSTATS_LO) & RL_DUMPSTATS_START) == 0) break; DELAY(1000); } bus_dmamap_sync(sc->rl_ldata.rl_stag, sc->rl_ldata.rl_smap, BUS_DMASYNC_POSTREAD); RL_UNLOCK(sc); if (i == 0) { device_printf(sc->rl_dev, "DUMP statistics request timed out\n"); return (ETIMEDOUT); } done: stats = sc->rl_ldata.rl_stats; printf("%s statistics:\n", device_get_nameunit(sc->rl_dev)); printf("Tx frames : %ju\n", (uintmax_t)le64toh(stats->rl_tx_pkts)); printf("Rx frames : %ju\n", (uintmax_t)le64toh(stats->rl_rx_pkts)); printf("Tx errors : %ju\n", (uintmax_t)le64toh(stats->rl_tx_errs)); printf("Rx errors : %u\n", le32toh(stats->rl_rx_errs)); printf("Rx missed frames : %u\n", (uint32_t)le16toh(stats->rl_missed_pkts)); printf("Rx frame alignment errs : %u\n", (uint32_t)le16toh(stats->rl_rx_framealign_errs)); printf("Tx single collisions : %u\n", le32toh(stats->rl_tx_onecoll)); printf("Tx multiple collisions : %u\n", le32toh(stats->rl_tx_multicolls)); printf("Rx unicast frames : %ju\n", (uintmax_t)le64toh(stats->rl_rx_ucasts)); printf("Rx broadcast frames : %ju\n", (uintmax_t)le64toh(stats->rl_rx_bcasts)); printf("Rx multicast frames : %u\n", le32toh(stats->rl_rx_mcasts)); printf("Tx aborts : %u\n", (uint32_t)le16toh(stats->rl_tx_aborts)); printf("Tx underruns : %u\n", (uint32_t)le16toh(stats->rl_rx_underruns)); } return (error); } static int sysctl_int_range(SYSCTL_HANDLER_ARGS, int low, int high) { int error, value; if (arg1 == NULL) return (EINVAL); value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if 
(value < low || value > high) return (EINVAL); *(int *)arg1 = value; return (0); } static int sysctl_hw_re_int_mod(SYSCTL_HANDLER_ARGS) { return (sysctl_int_range(oidp, arg1, arg2, req, RL_TIMER_MIN, RL_TIMER_MAX)); } Index: user/alc/PQ_LAUNDRY/sys/fs/cd9660/cd9660_vnops.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/fs/cd9660/cd9660_vnops.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/fs/cd9660/cd9660_vnops.c (revision 308054) @@ -1,874 +1,916 @@ /*- * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vnops.c 8.19 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include static vop_setattr_t cd9660_setattr; static vop_open_t cd9660_open; static vop_access_t cd9660_access; static vop_getattr_t cd9660_getattr; static vop_ioctl_t cd9660_ioctl; static vop_pathconf_t cd9660_pathconf; static vop_read_t cd9660_read; struct isoreaddir; static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off); static int iso_shipdir(struct isoreaddir *idp); static vop_readdir_t cd9660_readdir; static vop_readlink_t cd9660_readlink; static vop_strategy_t cd9660_strategy; static vop_vptofh_t cd9660_vptofh; +static vop_getpages_t cd9660_getpages; /* * Setattr call. Only allowed for block and character special devices. 
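 * Any attempt to change flags, ownership, timestamps, or the mode on
 * this read-only filesystem returns EROFS.  Size changes return EISDIR
 * for directories, EROFS for regular files and symlinks, and are
 * silently accepted as no-ops for special files.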
*/ static int cd9660_setattr(ap) struct vop_setattr_args /* { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) return (EROFS); if (vap->va_size != (u_quad_t)VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: return (EROFS); case VCHR: case VBLK: case VSOCK: case VFIFO: case VNON: case VBAD: case VMARKER: return (0); } } return (0); } /* * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. * The mode is shifted to select the owner/group/other fields. The * super user is granted all permissions. */ /* ARGSUSED */ static int cd9660_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); /* * Disallow write attempts unless the file is a socket, * fifo, or a block or character device resident on the * filesystem. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: return (EROFS); /* NOT REACHED */ default: break; } } return (vaccess(vp->v_type, ip->inode.iso_mode, ip->inode.iso_uid, ip->inode.iso_gid, ap->a_accmode, ap->a_cred, NULL)); } static int cd9660_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; } */ *ap; { struct vnode *vp = ap->a_vp; struct iso_node *ip = VTOI(vp); if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); vnode_create_vobject(vp, ip->i_size, ap->a_td); return (0); } static int cd9660_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct iso_node *ip = VTOI(vp); vap->va_fsid = dev2udev(ip->i_mnt->im_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->inode.iso_mode; vap->va_nlink = ip->inode.iso_links; vap->va_uid = ip->inode.iso_uid; vap->va_gid = ip->inode.iso_gid; vap->va_atime = ip->inode.iso_atime; vap->va_mtime = ip->inode.iso_mtime; vap->va_ctime = ip->inode.iso_ctime; vap->va_rdev = ip->inode.iso_rdev; vap->va_size = (u_quad_t) ip->i_size; if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) { struct vop_readlink_args rdlnk; struct iovec aiov; struct uio auio; char *cp; cp = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = curthread; auio.uio_resid = MAXPATHLEN; rdlnk.a_uio = &auio; rdlnk.a_vp = ap->a_vp; rdlnk.a_cred = ap->a_cred; if (cd9660_readlink(&rdlnk) == 0) vap->va_size = MAXPATHLEN - auio.uio_resid; free(cp, M_TEMP); } vap->va_flags = 0; vap->va_gen = 1; vap->va_blocksize = ip->i_mnt->logical_block_size; vap->va_bytes = (u_quad_t) ip->i_size; vap->va_type = vp->v_type; vap->va_filerev = 0; return (0); } /* * Vnode op for ioctl. 
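 * Only FIOGETLBA is handled; it returns the file's starting logical
 * block number (iso_start).  Anything else fails with ENOTTY.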
*/ static int cd9660_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp; struct iso_node *ip; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); return (EBADF); } if (vp->v_type == VCHR || vp->v_type == VBLK) { VOP_UNLOCK(vp, 0); return (EOPNOTSUPP); } ip = VTOI(vp); error = 0; switch (ap->a_command) { case FIOGETLBA: *(int *)(ap->a_data) = ip->iso_start; break; default: error = ENOTTY; break; } VOP_UNLOCK(vp, 0); return (error); } /* * Vnode op for reading. */ static int cd9660_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct iso_node *ip = VTOI(vp); struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; int rasize, error = 0; int seqcount; long size, n, on; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); seqcount = ap->a_ioflag >> IO_SEQSHIFT; if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); imp = ip->i_mnt; do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); n = MIN(imp->logical_block_size - on, uio->uio_resid); diff = (off_t)ip->i_size - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; size = blksize(imp, ip, lbn); rablock = lbn + 1; if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { if (lblktosize(imp, rablock) < ip->i_size) error = cluster_read(vp, (off_t)ip->i_size, lbn, size, NOCRED, uio->uio_resid, (ap->a_ioflag >> 16), 0, &bp); else error = bread(vp, lbn, size, NOCRED, &bp); } else { if (seqcount > 1 && lblktosize(imp, rablock) < ip->i_size) { rasize = blksize(imp, ip, rablock); error = breadn(vp, lbn, size, &rablock, &rasize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); } if (error != 0) return (error); n = MIN(n, size - bp->b_resid); error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); return (error); } /* * Structure for reading directories */ struct isoreaddir { struct dirent saveent; struct dirent assocent; struct dirent current; off_t saveoff; off_t assocoff; off_t curroff; struct uio *uio; off_t uio_off; int eofflag; u_long *cookies; int ncookies; }; static int iso_uiodir(idp,dp,off) struct isoreaddir *idp; struct dirent *dp; off_t off; { int error; dp->d_name[dp->d_namlen] = 0; dp->d_reclen = GENERIC_DIRSIZ(dp); if (idp->uio->uio_resid < dp->d_reclen) { idp->eofflag = 0; return (-1); } if (idp->cookies) { if (idp->ncookies <= 0) { idp->eofflag = 0; return (-1); } *idp->cookies++ = off; --idp->ncookies; } if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0) return (error); idp->uio_off = off; return (0); } static int iso_shipdir(idp) struct isoreaddir *idp; { struct dirent *dp; int cl, sl, assoc; int error; char *cname, *sname; cl = idp->current.d_namlen; cname = idp->current.d_name; assoc = (cl > 1) && (*cname == ASSOCCHAR); if (assoc) { cl--; cname++; } dp = &idp->saveent; sname = dp->d_name; if (!(sl = dp->d_namlen)) { dp = &idp->assocent; sname = dp->d_name + 1; sl = dp->d_namlen - 1; } if (sl > 0) { if (sl != cl || bcmp(sname,cname,sl)) { if (idp->assocent.d_namlen) { if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0) return (error); idp->assocent.d_namlen = 0; } if (idp->saveent.d_namlen) { if ((error = 
iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0) return (error); idp->saveent.d_namlen = 0; } } } idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current); if (assoc) { idp->assocoff = idp->curroff; bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); } else { idp->saveoff = idp->curroff; bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); } return (0); } /* * Vnode op for readdir */ static int cd9660_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct uio *uio = ap->a_uio; struct isoreaddir *idp; struct vnode *vdp = ap->a_vp; struct iso_node *dp; struct iso_mnt *imp; struct buf *bp = NULL; struct iso_directory_record *ep; int entryoffsetinblock; doff_t endsearch; u_long bmask; int error = 0; int reclen; u_short namelen; int ncookies = 0; u_long *cookies = NULL; dp = VTOI(vdp); imp = dp->i_mnt; bmask = imp->im_bmask; idp = malloc(sizeof(*idp), M_TEMP, M_WAITOK); idp->saveent.d_namlen = idp->assocent.d_namlen = 0; /* * XXX * Is it worth trying to figure out the type? */ idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type = DT_UNKNOWN; idp->uio = uio; if (ap->a_ncookies == NULL) { idp->cookies = NULL; } else { /* * Guess the number of cookies needed. */ ncookies = uio->uio_resid / 16; cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); idp->cookies = cookies; idp->ncookies = ncookies; } idp->eofflag = 1; idp->curroff = uio->uio_offset; idp->uio_off = uio->uio_offset; if ((entryoffsetinblock = idp->curroff & bmask) && (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) { free(idp, M_TEMP); return (error); } endsearch = dp->i_size; while (idp->curroff < endsearch) { /* * If offset is on a block boundary, * read the next directory block. * Release previous if it exists. */ if ((idp->curroff & bmask) == 0) { if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0) break; entryoffsetinblock = 0; } /* * Get pointer to next entry. 
*/ ep = (struct iso_directory_record *) ((char *)bp->b_data + entryoffsetinblock); reclen = isonum_711(ep->length); if (reclen == 0) { /* skip to next block, if any */ idp->curroff = (idp->curroff & ~bmask) + imp->logical_block_size; continue; } if (reclen < ISO_DIRECTORY_RECORD_SIZE) { error = EINVAL; /* illegal entry, stop */ break; } if (entryoffsetinblock + reclen > imp->logical_block_size) { error = EINVAL; /* illegal directory, so stop looking */ break; } idp->current.d_namlen = isonum_711(ep->name_len); if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { error = EINVAL; /* illegal entry, stop */ break; } if (isonum_711(ep->flags)&2) idp->current.d_fileno = isodirino(ep, imp); else idp->current.d_fileno = dbtob(bp->b_blkno) + entryoffsetinblock; idp->curroff += reclen; switch (imp->iso_ftype) { case ISO_FTYPE_RRIP: cd9660_rrip_getname(ep,idp->current.d_name, &namelen, &idp->current.d_fileno,imp); idp->current.d_namlen = (u_char)namelen; if (idp->current.d_namlen) error = iso_uiodir(idp,&idp->current,idp->curroff); break; default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/ strcpy(idp->current.d_name,".."); if (idp->current.d_namlen == 1 && ep->name[0] == 0) { idp->current.d_namlen = 1; error = iso_uiodir(idp,&idp->current,idp->curroff); } else if (idp->current.d_namlen == 1 && ep->name[0] == 1) { idp->current.d_namlen = 2; error = iso_uiodir(idp,&idp->current,idp->curroff); } else { isofntrans(ep->name,idp->current.d_namlen, idp->current.d_name, &namelen, imp->iso_ftype == ISO_FTYPE_9660, isonum_711(ep->flags)&4, imp->joliet_level, imp->im_flags, imp->im_d2l); idp->current.d_namlen = (u_char)namelen; if (imp->iso_ftype == ISO_FTYPE_DEFAULT) error = iso_shipdir(idp); else error = iso_uiodir(idp,&idp->current,idp->curroff); } } if (error) break; entryoffsetinblock += reclen; } if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { idp->current.d_namlen = 0; error = iso_shipdir(idp); } if (error < 0) error = 0; if (ap->a_ncookies != NULL) { if (error) free(cookies, M_TEMP); else { /* * Work out the number of cookies actually used. */ *ap->a_ncookies = ncookies - idp->ncookies; *ap->a_cookies = cookies; } } if (bp) brelse (bp); uio->uio_offset = idp->uio_off; *ap->a_eofflag = idp->eofflag; free(idp, M_TEMP); return (error); } /* * Return target name of a symbolic link * Shouldn't we get the parent vnode and read the data from there? * This could eventually result in deadlocks in cd9660_lookup. * But otherwise the block read here is in the block buffer two times. */ typedef struct iso_directory_record ISODIR; typedef struct iso_node ISONODE; typedef struct iso_mnt ISOMNT; static int cd9660_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { ISONODE *ip; ISODIR *dirp; ISOMNT *imp; struct buf *bp; struct uio *uio; u_short symlen; int error; char *symname; ip = VTOI(ap->a_vp); imp = ip->i_mnt; uio = ap->a_uio; if (imp->iso_ftype != ISO_FTYPE_RRIP) return (EINVAL); /* * Get parents directory record block that this inode included. */ error = bread(imp->im_devvp, (ip->i_number >> imp->im_bshift) << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { brelse(bp); return (EINVAL); } /* * Setup the directory pointer for this inode */ dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask)); /* * Just make sure, we have a right one.... 
* 1: Check not cross boundary on block */ if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) > (unsigned)imp->logical_block_size) { brelse(bp); return (EINVAL); } /* * Now get a buffer * Abuse a namei buffer for now. */ if (uio->uio_segflg == UIO_SYSSPACE) symname = uio->uio_iov->iov_base; else symname = uma_zalloc(namei_zone, M_WAITOK); /* * Ok, we just gathering a symbolic name in SL record. */ if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) { if (uio->uio_segflg != UIO_SYSSPACE) uma_zfree(namei_zone, symname); brelse(bp); return (EINVAL); } /* * Don't forget before you leave from home ;-) */ brelse(bp); /* * return with the symbolic name to caller's. */ if (uio->uio_segflg != UIO_SYSSPACE) { error = uiomove(symname, symlen, uio); uma_zfree(namei_zone, symname); return (error); } uio->uio_resid -= symlen; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen; uio->uio_iov->iov_len -= symlen; return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ static int cd9660_strategy(ap) struct vop_strategy_args /* { struct buf *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct iso_node *ip; struct bufobj *bo; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { bp->b_blkno = (ip->iso_start + bp->b_lblkno) << (ip->i_mnt->im_bshift - DEV_BSHIFT); } bp->b_iooffset = dbtob(bp->b_blkno); bo = ip->i_mnt->im_bo; BO_STRATEGY(bo, bp); return (0); } /* * Return POSIX pathconf information applicable to cd9660 filesystems. */ static int cd9660_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) *ap->a_retval = NAME_MAX; else *ap->a_retval = 37; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Vnode pointer to File handle */ static int cd9660_vptofh(ap) struct vop_vptofh_args /* { struct vnode *a_vp; struct fid *a_fhp; } */ *ap; { struct ifid ifh; struct iso_node *ip = VTOI(ap->a_vp); ifh.ifid_len = sizeof(struct ifid); ifh.ifid_ino = ip->i_number; ifh.ifid_start = ip->iso_start; /* * This intentionally uses sizeof(ifh) in order to not copy stack * garbage on ILP32. 
*/ memcpy(ap->a_fhp, &ifh, sizeof(ifh)); #ifdef ISOFS_DBG printf("vptofh: ino %d, start %ld\n", ifh.ifid_ino, ifh.ifid_start); #endif return (0); } +SYSCTL_NODE(_vfs, OID_AUTO, cd9660, CTLFLAG_RW, 0, "cd9660 filesystem"); +static int use_buf_pager = 1; +SYSCTL_INT(_vfs_cd9660, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, + &use_buf_pager, 0, + "Use buffer pager instead of bmap"); + +static daddr_t +cd9660_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) +{ + + return (lblkno(VTOI(vp)->i_mnt, off)); +} + +static int +cd9660_gbp_getblksz(struct vnode *vp, daddr_t lbn) +{ + struct iso_node *ip; + + ip = VTOI(vp); + return (blksize(ip->i_mnt, ip, lbn)); +} + +static int +cd9660_getpages(struct vop_getpages_args *ap) +{ + struct vnode *vp; + + vp = ap->a_vp; + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (EOPNOTSUPP); + + if (use_buf_pager) + return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, cd9660_gbp_getblkno, + cd9660_gbp_getblksz)); + return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, NULL, NULL)); +} + /* * Global vfs data structures for cd9660 */ struct vop_vector cd9660_vnodeops = { .vop_default = &default_vnodeops, .vop_open = cd9660_open, .vop_access = cd9660_access, .vop_bmap = cd9660_bmap, .vop_cachedlookup = cd9660_lookup, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_ioctl = cd9660_ioctl, .vop_lookup = vfs_cache_lookup, .vop_pathconf = cd9660_pathconf, .vop_read = cd9660_read, .vop_readdir = cd9660_readdir, .vop_readlink = cd9660_readlink, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, .vop_strategy = cd9660_strategy, .vop_vptofh = cd9660_vptofh, + .vop_getpages = cd9660_getpages, }; /* * Special device vnode ops */ struct vop_vector cd9660_fifoops = { .vop_default = &fifo_specops, .vop_access = cd9660_access, .vop_getattr = cd9660_getattr, .vop_inactive = cd9660_inactive, .vop_reclaim = cd9660_reclaim, .vop_setattr = cd9660_setattr, .vop_vptofh = cd9660_vptofh, }; Index: user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_fat.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_fat.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_fat.c (revision 308054) @@ -1,1134 +1,1163 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_fat.c,v 1.28 1997/11/17 15:36:49 ws Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. 
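The cd9660 change above routes VOP_GETPAGES through vfs_bio_getpages() behind a new vfs.cd9660.use_buf_pager knob (CTLFLAG_RWTUN, so it can also be preset as a loader tunable); setting it to 0 falls back to vnode_pager_generic_getpages(). A minimal sketch of flipping it at run time with the standard sysctlbyname(3) interface (requires root):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <err.h>

    int
    main(void)
    {
        int zero = 0;

        /* Fall back to the bmap-based vnode pager for cd9660. */
        if (sysctlbyname("vfs.cd9660.use_buf_pager", NULL, NULL,
            &zero, sizeof(zero)) == -1)
            err(1, "sysctlbyname");
        return (0);
    }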
* * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include +#define FULL_RUN ((u_int)0xffffffff) + static int chainalloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got); static int chainlength(struct msdosfsmount *pmp, u_long start, u_long count); static void fatblock(struct msdosfsmount *pmp, u_long ofs, u_long *bnp, u_long *sizep, u_long *bop); static int fatchain(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith); static void fc_lookup(struct denode *dep, u_long findcn, u_long *frcnp, u_long *fsrcnp); static void updatefats(struct msdosfsmount *pmp, struct buf *bp, u_long fatbn); static __inline void usemap_alloc(struct msdosfsmount *pmp, u_long cn); static __inline void usemap_free(struct msdosfsmount *pmp, u_long cn); static int clusteralloc1(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got); static void fatblock(struct msdosfsmount *pmp, u_long ofs, u_long *bnp, u_long *sizep, u_long *bop) { u_long bn, size; bn = ofs / pmp->pm_fatblocksize * pmp->pm_fatblocksec; size = min(pmp->pm_fatblocksec, pmp->pm_FATsecs - bn) * DEV_BSIZE; bn += pmp->pm_fatblk + pmp->pm_curfat * pmp->pm_FATsecs; if (bnp) *bnp = bn; if (sizep) *sizep = size; if (bop) *bop = ofs % pmp->pm_fatblocksize; } /* * Map the logical cluster number of a file into a physical disk sector * that is filesystem relative. * * dep - address of denode representing the file of interest * findcn - file relative cluster whose filesystem relative cluster number * and/or block number are/is to be found * bnp - address of where to place the filesystem relative block number. * If this pointer is null then don't return this quantity. * cnp - address of where to place the filesystem relative cluster number. * If this pointer is null then don't return this quantity. * sp - pointer to returned block size * * NOTE: Either bnp or cnp must be non-null. * This function has one side effect. If the requested file relative cluster * is beyond the end of file, then the actual number of clusters in the file * is returned in *cnp. This is useful for determining how long a directory is. * If cnp is null, nothing is returned. 
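One consequence of the contract described above is that pcbmap() doubles as a cluster counter: asking for a file-relative cluster that cannot exist makes it return E2BIG with the actual cluster count in *cnp. extendfile() further down relies on exactly that; a condensed kernel-context sketch of the calling pattern:

    u_long cn;
    int error;

    /* 0xffff is past any possible file-relative cluster; E2BIG is expected. */
    error = pcbmap(dep, 0xffff, NULL, &cn, NULL);
    if (error != E2BIG)
        return (error);     /* a genuine failure */
    /* cn now holds the number of clusters currently in the file. */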
*/ int pcbmap(struct denode *dep, u_long findcn, daddr_t *bnp, u_long *cnp, int *sp) { int error; u_long i; u_long cn; u_long prevcn = 0; /* XXX: prevcn could be used unititialized */ u_long byteoffset; u_long bn; u_long bo; struct buf *bp = NULL; u_long bp_bn = -1; struct msdosfsmount *pmp = dep->de_pmp; u_long bsize; KASSERT(bnp != NULL || cnp != NULL || sp != NULL, ("pcbmap: extra call")); ASSERT_VOP_ELOCKED(DETOV(dep), "pcbmap"); cn = dep->de_StartCluster; /* * The "file" that makes up the root directory is contiguous, * permanently allocated, of fixed size, and is not made up of * clusters. If the cluster number is beyond the end of the root * directory, then return the number of clusters in the file. */ if (cn == MSDOSFSROOT) { if (dep->de_Attributes & ATTR_DIRECTORY) { if (de_cn2off(pmp, findcn) >= dep->de_FileSize) { if (cnp) *cnp = de_bn2cn(pmp, pmp->pm_rootdirsize); return (E2BIG); } if (bnp) *bnp = pmp->pm_rootdirblk + de_cn2bn(pmp, findcn); if (cnp) *cnp = MSDOSFSROOT; if (sp) *sp = min(pmp->pm_bpcluster, dep->de_FileSize - de_cn2off(pmp, findcn)); return (0); } else { /* just an empty file */ if (cnp) *cnp = 0; return (E2BIG); } } /* * All other files do I/O in cluster sized blocks */ if (sp) *sp = pmp->pm_bpcluster; /* * Rummage around in the fat cache, maybe we can avoid tromping * through every fat entry for the file. And, keep track of how far * off the cache was from where we wanted to be. */ i = 0; fc_lookup(dep, findcn, &i, &cn); /* * Handle all other files or directories the normal way. */ for (; i < findcn; i++) { /* * Stop with all reserved clusters, not just with EOF. */ if ((cn | ~pmp->pm_fatmask) >= CLUST_RSRVD) goto hiteof; byteoffset = FATOFS(pmp, cn); fatblock(pmp, byteoffset, &bn, &bsize, &bo); if (bn != bp_bn) { if (bp) brelse(bp); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp_bn = bn; } prevcn = cn; if (bo >= bsize) { if (bp) brelse(bp); return (EIO); } if (FAT32(pmp)) cn = getulong(&bp->b_data[bo]); else cn = getushort(&bp->b_data[bo]); if (FAT12(pmp) && (prevcn & 1)) cn >>= 4; cn &= pmp->pm_fatmask; /* * Force the special cluster numbers * to be the same for all cluster sizes * to let the rest of msdosfs handle * all cases the same. */ if ((cn | ~pmp->pm_fatmask) >= CLUST_RSRVD) cn |= ~pmp->pm_fatmask; } if (!MSDOSFSEOF(pmp, cn)) { if (bp) brelse(bp); if (bnp) *bnp = cntobn(pmp, cn); if (cnp) *cnp = cn; fc_setcache(dep, FC_LASTMAP, i, cn); return (0); } hiteof:; if (cnp) *cnp = i; if (bp) brelse(bp); /* update last file cluster entry in the fat cache */ fc_setcache(dep, FC_LASTFC, i - 1, prevcn); return (E2BIG); } /* * Find the closest entry in the fat cache to the cluster we are looking * for. */ static void fc_lookup(struct denode *dep, u_long findcn, u_long *frcnp, u_long *fsrcnp) { int i; u_long cn; struct fatcache *closest = NULL; ASSERT_VOP_LOCKED(DETOV(dep), "fc_lookup"); for (i = 0; i < FC_SIZE; i++) { cn = dep->de_fc[i].fc_frcn; if (cn != FCE_EMPTY && cn <= findcn) { if (closest == NULL || cn > closest->fc_frcn) closest = &dep->de_fc[i]; } } if (closest) { *frcnp = closest->fc_frcn; *fsrcnp = closest->fc_fsrcn; } } /* * Purge the fat cache in denode dep of all entries relating to file * relative cluster frcn and beyond. */ void fc_purge(struct denode *dep, u_int frcn) { int i; struct fatcache *fcp; ASSERT_VOP_ELOCKED(DETOV(dep), "fc_purge"); fcp = dep->de_fc; for (i = 0; i < FC_SIZE; i++, fcp++) { if (fcp->fc_frcn >= frcn) fcp->fc_frcn = FCE_EMPTY; } } /* * Update the fat. 
* If mirroring the fat, update all copies, with the first copy as last. * Else update only the current fat (ignoring the others). * * pmp - msdosfsmount structure for filesystem to update * bp - addr of modified fat block * fatbn - block number relative to begin of filesystem of the modified fat block. */ static void updatefats(struct msdosfsmount *pmp, struct buf *bp, u_long fatbn) { struct buf *bpn; int cleanfat, i; #ifdef MSDOSFS_DEBUG printf("updatefats(pmp %p, bp %p, fatbn %lu)\n", pmp, bp, fatbn); #endif if (pmp->pm_flags & MSDOSFS_FATMIRROR) { /* * Now copy the block(s) of the modified fat to the other copies of * the fat and write them out. This is faster than reading in the * other fats and then writing them back out. This could tie up * the fat for quite a while. Preventing others from accessing it. * To prevent us from going after the fat quite so much we use * delayed writes, unless they specfied "synchronous" when the * filesystem was mounted. If synch is asked for then use * bwrite()'s and really slow things down. */ if (fatbn != pmp->pm_fatblk || FAT12(pmp)) cleanfat = 0; else if (FAT16(pmp)) cleanfat = 16; else cleanfat = 32; for (i = 1; i < pmp->pm_FATs; i++) { fatbn += pmp->pm_FATsecs; /* getblk() never fails */ bpn = getblk(pmp->pm_devvp, fatbn, bp->b_bcount, 0, 0, 0); bcopy(bp->b_data, bpn->b_data, bp->b_bcount); /* Force the clean bit on in the other copies. */ if (cleanfat == 16) ((u_int8_t *)bpn->b_data)[3] |= 0x80; else if (cleanfat == 32) ((u_int8_t *)bpn->b_data)[7] |= 0x08; if (pmp->pm_mountp->mnt_flag & MNT_SYNCHRONOUS) bwrite(bpn); else bdwrite(bpn); } } /* * Write out the first (or current) fat last. */ if (pmp->pm_mountp->mnt_flag & MNT_SYNCHRONOUS) bwrite(bp); else bdwrite(bp); } /* * Updating entries in 12 bit fats is a pain in the butt. * * The following picture shows where nibbles go when moving from a 12 bit * cluster number into the appropriate bytes in the FAT. * * byte m byte m+1 byte m+2 * +----+----+ +----+----+ +----+----+ * | 0 1 | | 2 3 | | 4 5 | FAT bytes * +----+----+ +----+----+ +----+----+ * * +----+----+----+ +----+----+----+ * | 3 0 1 | | 4 5 2 | * +----+----+----+ +----+----+----+ * cluster n cluster n+1 * * Where n is even. 
m = n + (n >> 2) * */ static __inline void usemap_alloc(struct msdosfsmount *pmp, u_long cn) { MSDOSFS_ASSERT_MP_LOCKED(pmp); + KASSERT(cn <= pmp->pm_maxcluster, ("cn too large %lu %lu", cn, + pmp->pm_maxcluster)); KASSERT((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0, ("usemap_alloc on ro msdosfs mount")); KASSERT((pmp->pm_inusemap[cn / N_INUSEBITS] & (1 << (cn % N_INUSEBITS))) == 0, ("Allocating used sector %ld %ld %x", cn, cn % N_INUSEBITS, (unsigned)pmp->pm_inusemap[cn / N_INUSEBITS])); pmp->pm_inusemap[cn / N_INUSEBITS] |= 1 << (cn % N_INUSEBITS); KASSERT(pmp->pm_freeclustercount > 0, ("usemap_alloc: too little")); pmp->pm_freeclustercount--; pmp->pm_flags |= MSDOSFS_FSIMOD; } static __inline void usemap_free(struct msdosfsmount *pmp, u_long cn) { MSDOSFS_ASSERT_MP_LOCKED(pmp); + + KASSERT(cn <= pmp->pm_maxcluster, ("cn too large %lu %lu", cn, + pmp->pm_maxcluster)); KASSERT((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0, ("usemap_free on ro msdosfs mount")); pmp->pm_freeclustercount++; pmp->pm_flags |= MSDOSFS_FSIMOD; KASSERT((pmp->pm_inusemap[cn / N_INUSEBITS] & (1 << (cn % N_INUSEBITS))) != 0, ("Freeing unused sector %ld %ld %x", cn, cn % N_INUSEBITS, (unsigned)pmp->pm_inusemap[cn / N_INUSEBITS])); pmp->pm_inusemap[cn / N_INUSEBITS] &= ~(1 << (cn % N_INUSEBITS)); } int clusterfree(struct msdosfsmount *pmp, u_long cluster, u_long *oldcnp) { int error; u_long oldcn; error = fatentry(FAT_GET_AND_SET, pmp, cluster, &oldcn, MSDOSFSFREE); if (error) return (error); /* * If the cluster was successfully marked free, then update * the count of free clusters, and turn off the "allocated" * bit in the "in use" cluster bit map. */ MSDOSFS_LOCK_MP(pmp); usemap_free(pmp, cluster); MSDOSFS_UNLOCK_MP(pmp); if (oldcnp) *oldcnp = oldcn; return (0); } /* * Get or Set or 'Get and Set' the cluster'th entry in the fat. * * function - whether to get or set a fat entry * pmp - address of the msdosfsmount structure for the filesystem * whose fat is to be manipulated. * cn - which cluster is of interest * oldcontents - address of a word that is to receive the contents of the * cluster'th entry if this is a get function * newcontents - the new value to be written into the cluster'th element of * the fat if this is a set function. * * This function can also be used to free a cluster by setting the fat entry * for a cluster to 0. * * All copies of the fat are updated if this is a set function. NOTE: If * fatentry() marks a cluster as free it does not update the inusemap in * the msdosfsmount structure. This is left to the caller. */ int fatentry(int function, struct msdosfsmount *pmp, u_long cn, u_long *oldcontents, u_long newcontents) { int error; u_long readcn; u_long bn, bo, bsize, byteoffset; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("fatentry(func %d, pmp %p, clust %lu, oldcon %p, newcon %lx)\n", function, pmp, cn, oldcontents, newcontents); #endif #ifdef DIAGNOSTIC /* * Be sure they asked us to do something. */ if ((function & (FAT_SET | FAT_GET)) == 0) { #ifdef MSDOSFS_DEBUG printf("fatentry(): function code doesn't specify get or set\n"); #endif return (EINVAL); } /* * If they asked us to return a cluster number but didn't tell us * where to put it, give them an error. */ if ((function & FAT_GET) && oldcontents == NULL) { #ifdef MSDOSFS_DEBUG printf("fatentry(): get function with no place to put result\n"); #endif return (EINVAL); } #endif /* * Be sure the requested cluster is in the filesystem. 
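To make the nibble diagram above concrete: with fatmult 3 and fatdiv 2, FATOFS() puts the entry for an even cluster n at byte offset n + (n >> 1), i.e. 1.5 bytes per entry, and the odd entry n+1 starts in the high nibble of the shared middle byte, which is why the FAT12 paths below shift by 4 for odd cluster numbers. A small self-contained sketch of reading one 12-bit entry from a raw FAT image under those assumptions:

    #include <stdint.h>

    /*
     * Fetch the 12-bit FAT entry for cluster cn from an in-memory FAT
     * (fat[] assumed large enough, cn assumed valid).  Even entries use
     * byte m and the low nibble of byte m+1; odd entries use the high
     * nibble of byte m and all of the following byte.
     */
    static uint16_t
    fat12_get(const uint8_t *fat, uint32_t cn)
    {
        uint32_t m = cn + (cn >> 1);    /* 1.5 bytes per entry */
        uint16_t v = fat[m] | (uint16_t)(fat[m + 1] << 8);

        return ((cn & 1) ? (v >> 4) : (v & 0x0fff));
    }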
*/ if (cn < CLUST_FIRST || cn > pmp->pm_maxcluster) return (EINVAL); byteoffset = FATOFS(pmp, cn); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } if (function & FAT_GET) { if (FAT32(pmp)) readcn = getulong(&bp->b_data[bo]); else readcn = getushort(&bp->b_data[bo]); if (FAT12(pmp) & (cn & 1)) readcn >>= 4; readcn &= pmp->pm_fatmask; /* map reserved fat entries to same values for all fats */ if ((readcn | ~pmp->pm_fatmask) >= CLUST_RSRVD) readcn |= ~pmp->pm_fatmask; *oldcontents = readcn; } if (function & FAT_SET) { switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(&bp->b_data[bo]); if (cn & 1) { readcn &= 0x000f; readcn |= newcontents << 4; } else { readcn &= 0xf000; readcn |= newcontents & 0xfff; } putushort(&bp->b_data[bo], readcn); break; case FAT16_MASK: putushort(&bp->b_data[bo], newcontents); break; case FAT32_MASK: /* * According to spec we have to retain the * high order bits of the fat entry. */ readcn = getulong(&bp->b_data[bo]); readcn &= ~FAT32_MASK; readcn |= newcontents & FAT32_MASK; putulong(&bp->b_data[bo], readcn); break; } updatefats(pmp, bp, bn); bp = NULL; pmp->pm_fmod = 1; } if (bp) brelse(bp); return (0); } /* * Update a contiguous cluster chain * * pmp - mount point * start - first cluster of chain * count - number of clusters in chain * fillwith - what to write into fat entry of last cluster */ static int fatchain(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith) { int error; u_long bn, bo, bsize, byteoffset, readcn, newc; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("fatchain(pmp %p, start %lu, count %lu, fillwith %lx)\n", pmp, start, count, fillwith); #endif /* * Be sure the clusters are in the filesystem. */ if (start < CLUST_FIRST || start + count - 1 > pmp->pm_maxcluster) return (EINVAL); while (count > 0) { byteoffset = FATOFS(pmp, start); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } while (count > 0) { start++; newc = --count > 0 ? start : fillwith; switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(&bp->b_data[bo]); if (start & 1) { readcn &= 0xf000; readcn |= newc & 0xfff; } else { readcn &= 0x000f; readcn |= newc << 4; } putushort(&bp->b_data[bo], readcn); bo++; if (!(start & 1)) bo++; break; case FAT16_MASK: putushort(&bp->b_data[bo], newc); bo += 2; break; case FAT32_MASK: readcn = getulong(&bp->b_data[bo]); readcn &= ~pmp->pm_fatmask; readcn |= newc & pmp->pm_fatmask; putulong(&bp->b_data[bo], readcn); bo += 4; break; } if (bo >= bsize) break; } updatefats(pmp, bp, bn); } pmp->pm_fmod = 1; return (0); } /* * Check the length of a free cluster chain starting at start. * * pmp - mount point * start - start of chain * count - maximum interesting length */ static int chainlength(struct msdosfsmount *pmp, u_long start, u_long count) { u_long idx, max_idx; u_int map; u_long len; MSDOSFS_ASSERT_MP_LOCKED(pmp); + if (start > pmp->pm_maxcluster) + return (0); max_idx = pmp->pm_maxcluster / N_INUSEBITS; idx = start / N_INUSEBITS; start %= N_INUSEBITS; map = pmp->pm_inusemap[idx]; map &= ~((1 << start) - 1); if (map) { len = ffs(map) - 1 - start; - return (len > count ? 
count : len); + len = MIN(len, count); + if (start + len > pmp->pm_maxcluster) + len = pmp->pm_maxcluster - start + 1; + return (len); } len = N_INUSEBITS - start; - if (len >= count) - return (count); + if (len >= count) { + len = count; + if (start + len > pmp->pm_maxcluster) + len = pmp->pm_maxcluster - start + 1; + return (len); + } while (++idx <= max_idx) { if (len >= count) break; map = pmp->pm_inusemap[idx]; if (map) { len += ffs(map) - 1; break; } len += N_INUSEBITS; } - return (len > count ? count : len); + len = MIN(len, count); + if (start + len > pmp->pm_maxcluster) + len = pmp->pm_maxcluster - start + 1; + return (len); } /* * Allocate contigous free clusters. * * pmp - mount point. * start - start of cluster chain. * count - number of clusters to allocate. * fillwith - put this value into the fat entry for the * last allocated cluster. * retcluster - put the first allocated cluster's number here. * got - how many clusters were actually allocated. */ static int chainalloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { int error; u_long cl, n; MSDOSFS_ASSERT_MP_LOCKED(pmp); KASSERT((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0, ("chainalloc on ro msdosfs mount")); for (cl = start, n = count; n-- > 0;) usemap_alloc(pmp, cl++); pmp->pm_nxtfree = start + count; if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; pmp->pm_flags |= MSDOSFS_FSIMOD; error = fatchain(pmp, start, count, fillwith); - if (error != 0) + if (error != 0) { + for (cl = start, n = count; n-- > 0;) + usemap_free(pmp, cl++); return (error); + } #ifdef MSDOSFS_DEBUG printf("clusteralloc(): allocated cluster chain at %lu (%lu clusters)\n", start, count); #endif if (retcluster) *retcluster = start; if (got) *got = count; return (0); } /* * Allocate contiguous free clusters. * * pmp - mount point. * start - preferred start of cluster chain. * count - number of clusters requested. * fillwith - put this value into the fat entry for the * last allocated cluster. * retcluster - put the first allocated cluster's number here. * got - how many clusters were actually allocated. 
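The checks added to chainlength() above clamp the computed run so it never reaches past pm_maxcluster; the last word of pm_inusemap can cover bit positions beyond the real cluster range, and the new loop at the end of fillinusemap() further down marks those trailing bits as allocated for the same reason. For orientation, a simplified bit-at-a-time statement of the quantity chainlength() computes, assuming a set bit means the cluster is in use:

    #include <sys/types.h>

    #define N_INUSEBITS (8 * sizeof(u_int))     /* bits per in-use map word */

    /* Count consecutive free clusters starting at 'start', at most 'count'. */
    static u_long
    free_run(const u_int *inusemap, u_long maxcluster, u_long start,
        u_long count)
    {
        u_long cn, len;

        len = 0;
        for (cn = start; cn <= maxcluster && len < count; cn++, len++)
            if (inusemap[cn / N_INUSEBITS] & (1U << (cn % N_INUSEBITS)))
                break;      /* hit an allocated cluster */
        return (len);
    }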
*/ int clusteralloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { int error; MSDOSFS_LOCK_MP(pmp); error = clusteralloc1(pmp, start, count, fillwith, retcluster, got); MSDOSFS_UNLOCK_MP(pmp); return (error); } static int clusteralloc1(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { u_long idx; u_long len, newst, foundl, cn, l; u_long foundcn = 0; /* XXX: foundcn could be used unititialized */ u_int map; MSDOSFS_ASSERT_MP_LOCKED(pmp); #ifdef MSDOSFS_DEBUG printf("clusteralloc(): find %lu clusters\n", count); #endif if (start) { if ((len = chainlength(pmp, start, count)) >= count) return (chainalloc(pmp, start, count, fillwith, retcluster, got)); } else len = 0; newst = pmp->pm_nxtfree; foundl = 0; for (cn = newst; cn <= pmp->pm_maxcluster;) { idx = cn / N_INUSEBITS; map = pmp->pm_inusemap[idx]; map |= (1 << (cn % N_INUSEBITS)) - 1; - if (map != (u_int)-1) { - cn = idx * N_INUSEBITS + ffs(map^(u_int)-1) - 1; + if (map != FULL_RUN) { + cn = idx * N_INUSEBITS + ffs(map ^ FULL_RUN) - 1; if ((l = chainlength(pmp, cn, count)) >= count) return (chainalloc(pmp, cn, count, fillwith, retcluster, got)); if (l > foundl) { foundcn = cn; foundl = l; } cn += l + 1; continue; } cn += N_INUSEBITS - cn % N_INUSEBITS; } for (cn = 0; cn < newst;) { idx = cn / N_INUSEBITS; map = pmp->pm_inusemap[idx]; map |= (1 << (cn % N_INUSEBITS)) - 1; - if (map != (u_int)-1) { - cn = idx * N_INUSEBITS + ffs(map^(u_int)-1) - 1; + if (map != FULL_RUN) { + cn = idx * N_INUSEBITS + ffs(map ^ FULL_RUN) - 1; if ((l = chainlength(pmp, cn, count)) >= count) return (chainalloc(pmp, cn, count, fillwith, retcluster, got)); if (l > foundl) { foundcn = cn; foundl = l; } cn += l + 1; continue; } cn += N_INUSEBITS - cn % N_INUSEBITS; } if (!foundl) return (ENOSPC); if (len) return (chainalloc(pmp, start, len, fillwith, retcluster, got)); else return (chainalloc(pmp, foundcn, foundl, fillwith, retcluster, got)); } /* * Free a chain of clusters. * * pmp - address of the msdosfs mount structure for the filesystem * containing the cluster chain to be freed. * startcluster - number of the 1st cluster in the chain of clusters to be * freed. */ int freeclusterchain(struct msdosfsmount *pmp, u_long cluster) { int error; struct buf *bp = NULL; u_long bn, bo, bsize, byteoffset; u_long readcn, lbn = -1; MSDOSFS_LOCK_MP(pmp); while (cluster >= CLUST_FIRST && cluster <= pmp->pm_maxcluster) { byteoffset = FATOFS(pmp, cluster); fatblock(pmp, byteoffset, &bn, &bsize, &bo); if (lbn != bn) { if (bp) updatefats(pmp, bp, lbn); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); MSDOSFS_UNLOCK_MP(pmp); return (error); } lbn = bn; } usemap_free(pmp, cluster); switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(&bp->b_data[bo]); if (cluster & 1) { cluster = readcn >> 4; readcn &= 0x000f; readcn |= MSDOSFSFREE << 4; } else { cluster = readcn; readcn &= 0xf000; readcn |= MSDOSFSFREE & 0xfff; } putushort(&bp->b_data[bo], readcn); break; case FAT16_MASK: cluster = getushort(&bp->b_data[bo]); putushort(&bp->b_data[bo], MSDOSFSFREE); break; case FAT32_MASK: cluster = getulong(&bp->b_data[bo]); putulong(&bp->b_data[bo], (MSDOSFSFREE & FAT32_MASK) | (cluster & ~FAT32_MASK)); break; } cluster &= pmp->pm_fatmask; if ((cluster | ~pmp->pm_fatmask) >= CLUST_RSRVD) cluster |= pmp->pm_fatmask; } if (bp) updatefats(pmp, bp, bn); MSDOSFS_UNLOCK_MP(pmp); return (0); } /* * Read in fat blocks looking for free clusters. 
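clusteralloc1() above scans the in-use map a word at a time: a word equal to FULL_RUN (now spelled out instead of the old (u_int)-1 casts) means every cluster it covers is allocated and can be skipped, otherwise ffs() on the complemented word locates the first free bit. The core of that test, as a tiny sketch:

    #include <sys/types.h>
    #include <strings.h>        /* ffs() */

    #define FULL_RUN    ((u_int)0xffffffff)

    /* Return the first free (zero) bit in 'map', or -1 if the word is full. */
    static int
    first_free_bit(u_int map)
    {
        return (map == FULL_RUN ? -1 : ffs(map ^ FULL_RUN) - 1);
    }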
For every free cluster * found turn off its corresponding bit in the pm_inusemap. */ int fillinusemap(struct msdosfsmount *pmp) { struct buf *bp = NULL; u_long cn, readcn; int error; u_long bn, bo, bsize, byteoffset; MSDOSFS_ASSERT_MP_LOCKED(pmp); /* * Mark all clusters in use, we mark the free ones in the fat scan * loop further down. */ for (cn = 0; cn < (pmp->pm_maxcluster + N_INUSEBITS) / N_INUSEBITS; cn++) - pmp->pm_inusemap[cn] = (u_int)-1; + pmp->pm_inusemap[cn] = FULL_RUN; /* * Figure how many free clusters are in the filesystem by ripping * through the fat counting the number of entries whose content is * zero. These represent free clusters. */ pmp->pm_freeclustercount = 0; for (cn = CLUST_FIRST; cn <= pmp->pm_maxcluster; cn++) { byteoffset = FATOFS(pmp, cn); bo = byteoffset % pmp->pm_fatblocksize; if (!bo || !bp) { /* Read new FAT block */ if (bp) brelse(bp); fatblock(pmp, byteoffset, &bn, &bsize, NULL); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } } if (FAT32(pmp)) readcn = getulong(&bp->b_data[bo]); else readcn = getushort(&bp->b_data[bo]); if (FAT12(pmp) && (cn & 1)) readcn >>= 4; readcn &= pmp->pm_fatmask; - if (readcn == 0) + if (readcn == CLUST_FREE) usemap_free(pmp, cn); } if (bp != NULL) brelse(bp); + + for (cn = pmp->pm_maxcluster + 1; cn < (pmp->pm_maxcluster + + N_INUSEBITS) / N_INUSEBITS; cn++) + pmp->pm_inusemap[cn / N_INUSEBITS] |= 1 << (cn % N_INUSEBITS); + return (0); } /* * Allocate a new cluster and chain it onto the end of the file. * * dep - the file to extend * count - number of clusters to allocate * bpp - where to return the address of the buf header for the first new * file block * ncp - where to put cluster number of the first newly allocated cluster * If this pointer is 0, do not return the cluster number. * flags - see fat.h * * NOTE: This function is not responsible for turning on the DE_UPDATE bit of * the de_flag field of the denode and it does not change the de_FileSize * field. This is left for the caller to do. */ int extendfile(struct denode *dep, u_long count, struct buf **bpp, u_long *ncp, int flags) { int error; u_long frcn; u_long cn, got; struct msdosfsmount *pmp = dep->de_pmp; struct buf *bp; daddr_t blkno; /* * Don't try to extend the root directory */ if (dep->de_StartCluster == MSDOSFSROOT && (dep->de_Attributes & ATTR_DIRECTORY)) { #ifdef MSDOSFS_DEBUG printf("extendfile(): attempt to extend root directory\n"); #endif return (ENOSPC); } /* * If the "file's last cluster" cache entry is empty, and the file * is not empty, then fill the cache entry by calling pcbmap(). */ if (dep->de_fc[FC_LASTFC].fc_frcn == FCE_EMPTY && dep->de_StartCluster != 0) { error = pcbmap(dep, 0xffff, 0, &cn, 0); /* we expect it to return E2BIG */ if (error != E2BIG) return (error); } dep->de_fc[FC_NEXTTOLASTFC].fc_frcn = dep->de_fc[FC_LASTFC].fc_frcn; dep->de_fc[FC_NEXTTOLASTFC].fc_fsrcn = dep->de_fc[FC_LASTFC].fc_fsrcn; while (count > 0) { /* * Allocate a new cluster chain and cat onto the end of the - * file. * If the file is empty we make de_StartCluster point - * to the new block. Note that de_StartCluster being 0 is - * sufficient to be sure the file is empty since we exclude - * attempts to extend the root directory above, and the root - * dir is the only file with a startcluster of 0 that has - * blocks allocated (sort of). + * file. + * If the file is empty we make de_StartCluster point + * to the new block. 
Note that de_StartCluster being + * 0 is sufficient to be sure the file is empty since + * we exclude attempts to extend the root directory + * above, and the root dir is the only file with a + * startcluster of 0 that has blocks allocated (sort + * of). */ if (dep->de_StartCluster == 0) cn = 0; else cn = dep->de_fc[FC_LASTFC].fc_fsrcn + 1; error = clusteralloc(pmp, cn, count, CLUST_EOFE, &cn, &got); if (error) return (error); count -= got; /* * Give them the filesystem relative cluster number if they want * it. */ if (ncp) { *ncp = cn; ncp = NULL; } if (dep->de_StartCluster == 0) { dep->de_StartCluster = cn; frcn = 0; } else { error = fatentry(FAT_SET, pmp, dep->de_fc[FC_LASTFC].fc_fsrcn, 0, cn); if (error) { clusterfree(pmp, cn, NULL); return (error); } frcn = dep->de_fc[FC_LASTFC].fc_frcn + 1; } /* * Update the "last cluster of the file" entry in the denode's fat * cache. */ fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1); if (flags & DE_CLEAR) { while (got-- > 0) { /* * Get the buf header for the new block of the file. */ if (dep->de_Attributes & ATTR_DIRECTORY) bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), pmp->pm_bpcluster, 0, 0, 0); else { bp = getblk(DETOV(dep), frcn++, pmp->pm_bpcluster, 0, 0, 0); /* * Do the bmap now, as in msdosfs_write */ if (pcbmap(dep, bp->b_lblkno, &blkno, 0, 0)) bp->b_blkno = -1; if (bp->b_blkno == -1) panic("extendfile: pcbmap"); else bp->b_blkno = blkno; } vfs_bio_clrbuf(bp); if (bpp) { *bpp = bp; bpp = NULL; } else bdwrite(bp); } } } return (0); } /*- * Routine to mark a FAT16 or FAT32 volume as "clean" or "dirty" by * manipulating the upper bit of the FAT entry for cluster 1. Note that * this bit is not defined for FAT12 volumes, which are always assumed to * be clean. * * The fatentry() routine only works on cluster numbers that a file could * occupy, so it won't manipulate the entry for cluster 1. So we have to do * it here. The code was stolen from fatentry() and tailored for cluster 1. * * Inputs: * pmp The MS-DOS volume to mark * dirty Non-zero if the volume should be marked dirty; zero if it * should be marked clean * * Result: * 0 Success * EROFS Volume is read-only * ? (other errors from called routines) */ int markvoldirty(struct msdosfsmount *pmp, int dirty) { struct buf *bp; u_long bn, bo, bsize, byteoffset, fatval; int error; /* * FAT12 does not support a "clean" bit, so don't do anything for * FAT12. */ if (FAT12(pmp)) return (0); /* Can't change the bit on a read-only filesystem. */ if (pmp->pm_flags & MSDOSFSMNT_RONLY) return (EROFS); /* * Fetch the block containing the FAT entry. It is given by the * pseudo-cluster 1. */ byteoffset = FATOFS(pmp, 1); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } /* * Get the current value of the FAT entry and set/clear the relevant * bit. Dirty means clear the "clean" bit; clean means set the * "clean" bit. */ if (FAT32(pmp)) { /* FAT32 uses bit 27. */ fatval = getulong(&bp->b_data[bo]); if (dirty) fatval &= 0xF7FFFFFF; else fatval |= 0x08000000; putulong(&bp->b_data[bo], fatval); } else { /* Must be FAT16; use bit 15. */ fatval = getushort(&bp->b_data[bo]); if (dirty) fatval &= 0x7FFF; else fatval |= 0x8000; putushort(&bp->b_data[bo], fatval); } /* Write out the modified FAT block synchronously. 
*/ return (bwrite(bp)); } Index: user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vfsops.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vfsops.c (revision 308054) @@ -1,1015 +1,1014 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static const char msdosfs_lock_msg[] = "fatlk"; /* Mount options that we support. */ static const char *msdosfs_opts[] = { "async", "noatime", "noclusterr", "noclusterw", "export", "force", "from", "sync", "cs_dos", "cs_local", "cs_win", "dirmask", "gid", "kiconv", "large", "longname", "longnames", "mask", "shortname", "shortnames", "uid", "win95", "nowin95", NULL }; #if 1 /*def PC98*/ /* * XXX - The boot signature formatted by NEC PC-98 DOS looks like a * garbage or a random value :-{ * If you want to use that broken-signatured media, define the * following symbol even though PC/AT. * (ex. 
mount PC-98 DOS formatted FD on PC/AT) */ #define MSDOSFS_NOCHECKSIG #endif MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table"); struct iconv_functions *msdosfs_iconv; static int update_mp(struct mount *mp, struct thread *td); static int mountmsdosfs(struct vnode *devvp, struct mount *mp); static vfs_fhtovp_t msdosfs_fhtovp; static vfs_mount_t msdosfs_mount; static vfs_root_t msdosfs_root; static vfs_statfs_t msdosfs_statfs; static vfs_sync_t msdosfs_sync; static vfs_unmount_t msdosfs_unmount; /* Maximum length of a character set name (arbitrary). */ #define MAXCSLEN 64 static int update_mp(struct mount *mp, struct thread *td) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); void *dos, *win, *local; int error, v; if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) { if (msdosfs_iconv != NULL) { error = vfs_getopt(mp->mnt_optnew, "cs_win", &win, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_local", &local, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_dos", &dos, NULL); if (!error) { msdosfs_iconv->open(win, local, &pmp->pm_u2w); msdosfs_iconv->open(local, win, &pmp->pm_w2u); msdosfs_iconv->open(dos, local, &pmp->pm_u2d); msdosfs_iconv->open(local, dos, &pmp->pm_d2u); } if (error != 0) return (error); } else { pmp->pm_w2u = NULL; pmp->pm_u2w = NULL; pmp->pm_d2u = NULL; pmp->pm_u2d = NULL; } } if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1) pmp->pm_gid = v; if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1) pmp->pm_uid = v; if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1) pmp->pm_mask = v & ALLPERMS; if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1) pmp->pm_dirmask = v & ALLPERMS; vfs_flagopt(mp->mnt_optnew, "shortname", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "shortnames", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "longname", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "longnames", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "kiconv", &pmp->pm_flags, MSDOSFSMNT_KICONV); if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; else pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else pmp->pm_flags |= MSDOSFSMNT_LONGNAME; return 0; } static int msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct msdosfs_args args; struct export_args exp; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); vfs_oexport_conv(&args.export, &exp); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &exp, sizeof(exp)); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "mask", "%d", args.mask); ma = mount_argf(ma, "dirmask", "%d", args.dirmask); ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname"); ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv"); ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN); ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN); ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN); error = kernel_mount(ma, flags); return (error); } /* * mp - path - addr in user space of mount point (ie /usr or 
whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(struct mount *mp) { struct vnode *devvp; /* vnode for blk device to mount */ struct thread *td; /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; struct nameidata ndp; int error, flags; accmode_t accmode; char *from; td = curthread; if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts)) return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) { /* * Forbid export requests if filesystem has * MSDOSFS_LARGEFS flag set. */ if ((pmp->pm_flags & MSDOSFS_LARGEFS) != 0) { vfs_mount_error(mp, "MSDOSFS_LARGEFS flag set, cannot export"); return (EOPNOTSUPP); } } if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, 0, flags, td); if (error) return (error); /* * Now the volume is clean. Mark it so while the * device is still rw. */ error = markvoldirty(pmp, 0); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* Downgrade the device from rw to ro. */ g_topology_lock(); error = g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* * Backing out after an error was painful in the * above. Now we are committed to succeeding. */ pmp->pm_fmod = 0; pmp->pm_flags |= MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); g_topology_lock(); error = g_access(pmp->pm_cp, 0, 1, 0); g_topology_unlock(); if (error) return (error); pmp->pm_fmod = 1; pmp->pm_flags &= ~MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); /* Now that the volume is modifiable, mark it dirty. */ error = markvoldirty(pmp, 1); if (error) return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); error = namei(&ndp); if (error) return (error); devvp = ndp.ni_vp; NDFREE(&ndp, NDF_ONLY_PNBUF); if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. 
*/ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { vput(devvp); if (devvp != pmp->pm_devvp) return (EINVAL); /* XXX needs translation */ } if (error) { vrele(devvp); return (error); } error = update_mp(mp, td); if (error) { if ((mp->mnt_flag & MNT_UPDATE) == 0) msdosfs_unmount(mp, MNT_FORCE); return error; } vfs_mountedfrom(mp, from); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(struct vnode *devvp, struct mount *mp) { struct msdosfsmount *pmp; struct buf *bp; struct cdev *dev; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; u_int8_t SecPerClust; u_long clusters; int ronly, error; struct g_consumer *cp; struct bufobj *bo; bp = NULL; /* This and pmp both used in error_exit. */ pmp = NULL; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { VOP_UNLOCK(devvp, 0); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp, 0); return (error); } dev_ref(dev); VOP_UNLOCK(devvp, 0); bo = &devvp->v_bufobj; /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. * * NOTE: 8192 is a magic size that works for ffs. */ error = bread(devvp, 0, 8192, NOCRED, &bp); if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; #ifndef MSDOSFS_NOCHECKSIG if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { error = EINVAL; goto error_exit; } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO); pmp->pm_mountp = mp; pmp->pm_cp = cp; pmp->pm_bo = bo; lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0); /* * Initialize ownerships and permissions, since nothing else will * initialize them iff we are mounting root. */ pmp->pm_uid = UID_ROOT; pmp->pm_gid = GID_WHEEL; pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH | S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR; /* * Experimental support for large MS-DOS filesystems. * WARNING: This uses at least 32 bytes of kernel memory (which is not * reclaimed until the FS is unmounted) for each file on disk to map * between the 32-bit inode numbers used by VFS and the 64-bit * pseudo-inode numbers used internally by msdosfs. This is only * safe to use in certain controlled situations (e.g. read-only FS * with less than 1 million files). * Since the mappings do not persist across unmounts (or reboots), these * filesystems are not suitable for exporting through NFS, or any other * application that requires fixed inode numbers. */ vfs_flagopt(mp->mnt_optnew, "large", &pmp->pm_flags, MSDOSFS_LARGEFS); /* * Compute several useful quantities from the bpb in the * bootsector. 
Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); if (pmp->pm_BytesPerSec < DEV_BSIZE) { error = EINVAL; goto error_exit; } pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; /* calculate the ratio of sector size to DEV_BSIZE */ pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE; /* * We don't check pm_Heads nor pm_SecPerTrack, because * these may not be set for EFI file systems. We don't * use these anyway, so we're unaffected if they are * invalid. */ if (!pmp->pm_BytesPerSec || !SecPerClust) { error = EINVAL; goto error_exit; } if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (!(pmp->pm_flags & MSDOSFS_LARGEFS)) { if (pmp->pm_HugeSectors > 0xffffffff / (pmp->pm_BytesPerSec / sizeof(struct direntry)) + 1) { /* * We cannot deal currently with this size of disk * due to fileid limitations (see msdosfs_getattr and * msdosfs_readdir) */ error = EINVAL; vfs_mount_error(mp, "Disk too big, try '-o large' mount option"); goto error_exit; } } if (pmp->pm_RootDirEnts == 0) { if (pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; #ifdef MSDOSFS_DEBUG printf("mountmsdosfs(): bad FAT32 filesystem\n"); #endif goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition * - number of FAT sectors: >= 1 */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < DEV_BSIZE) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_FATsecs == 0) || (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE) ) { error = EINVAL; goto error_exit; } pmp->pm_HugeSectors *= pmp->pm_BlkPerSec; pmp->pm_HiddenSects *= pmp->pm_BlkPerSec; /* XXX not used? */ pmp->pm_FATsecs *= pmp->pm_BlkPerSec; SecPerClust *= pmp->pm_BlkPerSec; pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec; } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = howmany(pmp->pm_RootDirEnts * sizeof(struct direntry), DEV_BSIZE); /* in blocks */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust + 1; pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE; /* XXX not used? 
*/ if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one fat entry will not be split across * multiple blocks. */ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv; if (pmp->pm_maxcluster >= clusters) { #ifdef MSDOSFS_DEBUG printf("Warning: number of clusters (%ld) exceeds FAT " "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters); #endif pmp->pm_maxcluster = clusters - 1; } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * 512; else pmp->pm_fatblocksize = PAGE_SIZE; pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize, pmp->pm_BytesPerSec); pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE; pmp->pm_bnshift = ffs(DEV_BSIZE) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * DEV_BSIZE; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check the fsinfo sector if we have one. Silently fix up our * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff) * or too large. Ignore fp->fsinfree for now, since we need to * read the entire FAT anyway to fill the inuse map. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) { pmp->pm_nxtfree = getulong(fp->fsinxtfree); if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; } else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Finish initializing pmp->pm_nxtfree (just in case the first few * sectors aren't properly reserved in the FAT). This completes * the fixup for fp->fsinxtfree, and fixes up the zero-initialized * value if there is no fsinfo. We will use pmp->pm_nxtfree * internally even if there is no fsinfo. */ if (pmp->pm_nxtfree < CLUST_FIRST) pmp->pm_nxtfree = CLUST_FIRST; /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_devvp = devvp; pmp->pm_dev = dev; /* * Have the inuse map filled in. */ MSDOSFS_LOCK_MP(pmp); error = fillinusemap(pmp); MSDOSFS_UNLOCK_MP(pmp); if (error != 0) goto error_exit; /* * If they want fat updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the fat being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. 
*/ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else { if ((error = markvoldirty(pmp, 1)) != 0) { (void)markvoldirty(pmp, 0); goto error_exit; } pmp->pm_fmod = 1; } mp->mnt_data = pmp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; - mp->mnt_kern_flag |= MNTK_USES_BCACHE; + mp->mnt_kern_flag |= MNTK_USES_BCACHE | MNTK_NO_IOPF; MNT_IUNLOCK(mp); if (pmp->pm_flags & MSDOSFS_LARGEFS) msdosfs_fileno_init(mp); return 0; error_exit: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (pmp) { lockdestroy(&pmp->pm_fatlock); - if (pmp->pm_inusemap) - free(pmp->pm_inusemap, M_MSDOSFSFAT); + free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); dev_rel(dev); return (error); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(struct mount *mp, int mntflags) { struct msdosfsmount *pmp; int error, flags; error = flags = 0; pmp = VFSTOMSDOSFS(mp); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) error = msdosfs_sync(mp, MNT_WAIT); if ((mntflags & MNT_FORCE) != 0) flags |= FORCECLOSE; else if (error != 0) return (error); error = vflush(mp, 0, flags, curthread); if (error != 0 && error != ENXIO) return (error); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) { error = markvoldirty(pmp, 0); if (error && error != ENXIO) { (void)markvoldirty(pmp, 1); return (error); } } if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { if (pmp->pm_w2u) msdosfs_iconv->close(pmp->pm_w2u); if (pmp->pm_u2w) msdosfs_iconv->close(pmp->pm_u2w); if (pmp->pm_d2u) msdosfs_iconv->close(pmp->pm_d2u); if (pmp->pm_u2d) msdosfs_iconv->close(pmp->pm_u2d); } #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); VI_LOCK(vp); vn_printf(vp, "msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("freef %p, freeb %p, mount %p\n", TAILQ_NEXT(vp, v_actfreelist), vp->v_actfreelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd), TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd), vp->v_bufobj.bo_numoutput, vp->v_type); VI_UNLOCK(vp); BO_UNLOCK(bo); } #endif g_topology_lock(); g_vfs_close(pmp->pm_cp); g_topology_unlock(); atomic_store_rel_ptr((uintptr_t *)&pmp->pm_dev->si_mountpt, 0); vrele(pmp->pm_devvp); dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); if (pmp->pm_flags & MSDOSFS_LARGEFS) msdosfs_fileno_free(mp); lockdestroy(&pmp->pm_fatlock); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } static int msdosfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_statfs(struct mount *mp, struct statfs *sbp) { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_maxcluster + 1; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? 
*/ return (0); } /* * If we have an FSInfo block, update it. */ static int msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor) { struct fsinfo *fp; struct buf *bp; int error; MSDOSFS_LOCK_MP(pmp); if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) { error = 0; goto unlock; } error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp); if (error != 0) { brelse(bp); goto unlock; } fp = (struct fsinfo *)bp->b_data; putulong(fp->fsinfree, pmp->pm_freeclustercount); putulong(fp->fsinxtfree, pmp->pm_nxtfree); pmp->pm_flags &= ~MSDOSFS_FSIMOD; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); unlock: MSDOSFS_UNLOCK_MP(pmp); return (error); } static int msdosfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *nvp; struct thread *td; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; td = curthread; /* * If we ever switch to not updating all of the fats all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update fats here */ } } /* * Write back each (modified) denode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, nvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } dep = VTODE(vp); if ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { if (error == ENOENT) goto loop; continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0); vrele(vp); } /* * Flush filesystem control info. */ if (waitfor != MNT_LAZY) { vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(pmp->pm_devvp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp, 0); } error = msdosfs_fsiflush(pmp, waitfor); if (error != 0) allerror = error; return (allerror); } static int msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; int error; error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); vnode_create_vobject(*vpp, dep->de_FileSize, curthread); return (0); } static struct vfsops msdosfs_vfsops = { .vfs_fhtovp = msdosfs_fhtovp, .vfs_mount = msdosfs_mount, .vfs_cmount = msdosfs_cmount, .vfs_root = msdosfs_root, .vfs_statfs = msdosfs_statfs, .vfs_sync = msdosfs_sync, .vfs_unmount = msdosfs_unmount, }; VFS_SET(msdosfs_vfsops, msdosfs, 0); MODULE_VERSION(msdosfs, 1); Index: user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vnops.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vnops.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vnops.c (revision 308054) @@ -1,1920 +1,1962 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include #define DOS_FILESIZE_MAX 0xffffffff /* * Prototypes for MSDOSFS vnode operations */ static vop_create_t msdosfs_create; static vop_mknod_t msdosfs_mknod; static vop_open_t msdosfs_open; static vop_close_t msdosfs_close; static vop_access_t msdosfs_access; static vop_getattr_t msdosfs_getattr; static vop_setattr_t msdosfs_setattr; static vop_read_t msdosfs_read; static vop_write_t msdosfs_write; static vop_fsync_t msdosfs_fsync; static vop_remove_t msdosfs_remove; static vop_link_t msdosfs_link; static vop_rename_t msdosfs_rename; static vop_mkdir_t msdosfs_mkdir; static vop_rmdir_t msdosfs_rmdir; static vop_symlink_t msdosfs_symlink; static vop_readdir_t msdosfs_readdir; static vop_bmap_t msdosfs_bmap; +static vop_getpages_t msdosfs_getpages; static vop_strategy_t msdosfs_strategy; static vop_print_t msdosfs_print; static vop_pathconf_t msdosfs_pathconf; static vop_vptofh_t msdosfs_vptofh; /* * Some general notes: * * In the ufs filesystem the inodes, superblocks, and indirect blocks are * read/written using the vnode for the filesystem. Blocks that represent * the contents of a file are read/written using the vnode for the file * (including directories when they are read/written as files). This * presents problems for the dos filesystem because data that should be in * an inode (if dos had them) resides in the directory itself. 
Since we * must update directory entries without the benefit of having the vnode * for the directory we must use the vnode for the filesystem. This means * that when a directory is actually read/written (via read, write, or * readdir, or seek) we must use the vnode for the filesystem instead of * the vnode for the directory as would happen in ufs. This is to insure we * retrieve the correct block from the buffer cache since the hash value is * based upon the vnode address and the desired block number. */ /* * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. We must also free * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or * only if the SAVESTART bit in cn_flags is clear on success. */ static int msdosfs_create(struct vop_create_args *ap) { struct componentname *cnp = ap->a_cnp; struct denode ndirent; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct timespec ts; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap); #endif /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad; } /* * Create a directory entry for the file, then call createde() to * have it installed. NOTE: DOS files are always executable. We * use the absence of the owner write bit to make the file * readonly. */ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_create: no name"); #endif bzero(&ndirent, sizeof(ndirent)); error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_ARCHIVE; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = 0; ndirent.de_FileSize = 0; ndirent.de_pmp = pdep->de_pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (0); bad: return (error); } static int msdosfs_mknod(struct vop_mknod_args *ap) { return (EINVAL); } static int msdosfs_open(struct vop_open_args *ap) { struct denode *dep = VTODE(ap->a_vp); vnode_create_vobject(ap->a_vp, dep->de_FileSize, ap->a_td); return 0; } static int msdosfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct timespec ts; VI_LOCK(vp); if (vp->v_usecount > 1) { getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); } VI_UNLOCK(vp); return 0; } static int msdosfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; mode_t file_mode; accmode_t accmode = ap->a_accmode; file_mode = S_IRWXU|S_IRWXG|S_IRWXO; file_mode &= (vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); /* * Disallow writing to directories and regular files if the * filesystem is read-only. 
*/ if (accmode & VWRITE) { switch (vp->v_type) { case VREG: case VDIR: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (vaccess(vp->v_type, file_mode, pmp->pm_uid, pmp->pm_gid, ap->a_accmode, ap->a_cred, NULL)); } static int msdosfs_getattr(struct vop_getattr_args *ap) { struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; mode_t mode; struct timespec ts; u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); uint64_t fileid; getnanotime(&ts); DETIMES(dep, &ts, &ts, &ts); vap->va_fsid = dev2udev(pmp->pm_dev); /* * The following computation of the fileid must be the same as that * used in msdosfs_readdir() to compute d_fileno. If not, pwd * doesn't work. */ if (dep->de_Attributes & ATTR_DIRECTORY) { fileid = (uint64_t)cntobn(pmp, dep->de_StartCluster) * dirsperblk; if (dep->de_StartCluster == MSDOSFSROOT) fileid = 1; } else { fileid = (uint64_t)cntobn(pmp, dep->de_dirclust) * dirsperblk; if (dep->de_dirclust == MSDOSFSROOT) fileid = (uint64_t)roottobn(pmp, 0) * dirsperblk; fileid += (uoff_t)dep->de_diroffset / sizeof(struct direntry); } if (pmp->pm_flags & MSDOSFS_LARGEFS) vap->va_fileid = msdosfs_fileno_map(pmp->pm_mountp, fileid); else vap->va_fileid = (long)fileid; mode = S_IRWXU|S_IRWXG|S_IRWXO; vap->va_mode = mode & (ap->a_vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask); vap->va_uid = pmp->pm_uid; vap->va_gid = pmp->pm_gid; vap->va_nlink = 1; vap->va_rdev = NODEV; vap->va_size = dep->de_FileSize; fattime2timespec(dep->de_MDate, dep->de_MTime, 0, 0, &vap->va_mtime); vap->va_ctime = vap->va_mtime; if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) { fattime2timespec(dep->de_ADate, 0, 0, 0, &vap->va_atime); fattime2timespec(dep->de_CDate, dep->de_CTime, dep->de_CHun, 0, &vap->va_birthtime); } else { vap->va_atime = vap->va_mtime; vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; } vap->va_flags = 0; if (dep->de_Attributes & ATTR_ARCHIVE) vap->va_flags |= UF_ARCHIVE; if (dep->de_Attributes & ATTR_HIDDEN) vap->va_flags |= UF_HIDDEN; if (dep->de_Attributes & ATTR_READONLY) vap->va_flags |= UF_READONLY; if (dep->de_Attributes & ATTR_SYSTEM) vap->va_flags |= UF_SYSTEM; vap->va_gen = 0; vap->va_blocksize = pmp->pm_bpcluster; vap->va_bytes = (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask; vap->va_type = ap->a_vp->v_type; vap->va_filerev = dep->de_modrev; return (0); } static int msdosfs_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): vp %p, vap %p, cred %p\n", ap->a_vp, vap, cred); #endif /* * Check for unsettable attributes. 
*/ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { #ifdef MSDOSFS_DEBUG printf("msdosfs_setattr(): returning EINVAL\n"); printf(" va_type %d, va_nlink %x, va_fsid %lx, va_fileid %lx\n", vap->va_type, vap->va_nlink, vap->va_fsid, vap->va_fileid); printf(" va_blocksize %lx, va_rdev %x, va_bytes %qx, va_gen %lx\n", vap->va_blocksize, vap->va_rdev, vap->va_bytes, vap->va_gen); printf(" va_uid %x, va_gid %x\n", vap->va_uid, vap->va_gid); #endif return (EINVAL); } /* * We don't allow setting attributes on the root directory. * The special case for the root directory is because before * FAT32, the root directory didn't have an entry for itself * (and was otherwise special). With FAT32, the root * directory is not so special, but still doesn't have an * entry for itself. */ if (vp->v_vflag & VV_ROOT) return (EINVAL); if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (error) return (error); } /* * We are very inconsistent about handling unsupported * attributes. We ignored the access time and the * read and execute bits. We were strict for the other * attributes. */ if (vap->va_flags & ~(UF_ARCHIVE | UF_HIDDEN | UF_READONLY | UF_SYSTEM)) return EOPNOTSUPP; if (vap->va_flags & UF_ARCHIVE) dep->de_Attributes |= ATTR_ARCHIVE; else dep->de_Attributes &= ~ATTR_ARCHIVE; if (vap->va_flags & UF_HIDDEN) dep->de_Attributes |= ATTR_HIDDEN; else dep->de_Attributes &= ~ATTR_HIDDEN; /* We don't allow changing the readonly bit on directories. */ if (vp->v_type != VDIR) { if (vap->va_flags & UF_READONLY) dep->de_Attributes |= ATTR_READONLY; else dep->de_Attributes &= ~ATTR_READONLY; } if (vap->va_flags & UF_SYSTEM) dep->de_Attributes |= ATTR_SYSTEM; else dep->de_Attributes &= ~ATTR_SYSTEM; dep->de_flag |= DE_MODIFIED; } if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { uid_t uid; gid_t gid; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); uid = vap->va_uid; if (uid == (uid_t)VNOVAL) uid = pmp->pm_uid; gid = vap->va_gid; if (gid == (gid_t)VNOVAL) gid = pmp->pm_gid; if (cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid || (gid != pmp->pm_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0); if (error) return (error); } if (uid != pmp->pm_uid || gid != pmp->pm_gid) return EINVAL; } if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VREG: /* * Truncation is only supported for regular files, * Disallow it if the filesystem is read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support any file types except regular * files and directories in this file system, so * this (default) case is unreachable and can do * anything. Keep falling through to detrunc() * for now. 
*/ break; } error = detrunc(dep, vap->va_size, 0, cred); if (error) return error; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = vn_utimes_perm(vp, vap, cred, td); if (error != 0) return (error); if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 && vap->va_atime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_ACCESS; timespec2fattime(&vap->va_atime, 0, &dep->de_ADate, NULL, NULL); } if (vap->va_mtime.tv_sec != VNOVAL) { dep->de_flag &= ~DE_UPDATE; timespec2fattime(&vap->va_mtime, 0, &dep->de_MDate, &dep->de_MTime, NULL); } /* * We don't set the archive bit when modifying the time of * a directory to emulate the Windows/DOS behavior. */ if (vp->v_type != VDIR) dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } /* * DOS files only have the ability to have their writability * attribute set, so we use the owner write bit to set the readonly * attribute. */ if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != pmp->pm_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (error) return (error); } if (vp->v_type != VDIR) { /* We ignore the read and execute bits. */ if (vap->va_mode & VWRITE) dep->de_Attributes &= ~ATTR_READONLY; else dep->de_Attributes |= ATTR_READONLY; dep->de_Attributes |= ATTR_ARCHIVE; dep->de_flag |= DE_MODIFIED; } } return (deupdat(dep, 0)); } static int msdosfs_read(struct vop_read_args *ap) { int error = 0; int blsize; int isadir; ssize_t orig_resid; u_int n; u_long diff; u_long on; daddr_t lbn; daddr_t rablock; int rasize; int seqcount; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct uio *uio = ap->a_uio; /* * If they didn't ask for any data, then we are done. */ orig_resid = uio->uio_resid; if (orig_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. * We don't need to check for large offsets as in ffs because * dep->de_FileSize <= DOS_FILESIZE_MAX < OFF_MAX, so large * offsets cannot cause overflow even in theory. */ seqcount = ap->a_ioflag >> IO_SEQSHIFT; isadir = dep->de_Attributes & ATTR_DIRECTORY; do { if (uio->uio_offset >= dep->de_FileSize) break; lbn = de_cluster(pmp, uio->uio_offset); rablock = lbn + 1; blsize = pmp->pm_bpcluster; on = uio->uio_offset & pmp->pm_crbomask; /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. */ if (isadir) { /* convert cluster # to block # */ error = pcbmap(dep, lbn, &lbn, 0, &blsize); if (error == E2BIG) { error = EINVAL; break; } else if (error) break; error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); } else if (de_cn2off(pmp, rablock) >= dep->de_FileSize) { error = bread(vp, lbn, blsize, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, dep->de_FileSize, lbn, blsize, NOCRED, on + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { rasize = blsize; error = breadn(vp, lbn, blsize, &rablock, &rasize, 1, NOCRED, &bp); } else { error = bread(vp, lbn, blsize, NOCRED, &bp); } if (error) { brelse(bp); break; } diff = pmp->pm_bpcluster - on; n = diff > uio->uio_resid ? 
uio->uio_resid : diff; diff = dep->de_FileSize - uio->uio_offset; if (diff < n) n = diff; diff = blsize - bp->b_resid; if (diff < n) n = diff; - error = uiomove(bp->b_data + on, (int) n, uio); + error = vn_io_fault_uiomove(bp->b_data + on, (int) n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); if (!isadir && (error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) dep->de_flag |= DE_ACCESS; return (error); } /* * Write data to a file or directory. */ static int msdosfs_write(struct vop_write_args *ap) { int n; int croffset; ssize_t resid; u_long osize; int error = 0; u_long count; int seqcount; daddr_t bn, lastcn; struct buf *bp; int ioflag = ap->a_ioflag; struct uio *uio = ap->a_uio; struct vnode *vp = ap->a_vp; struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; #ifdef MSDOSFS_DEBUG printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n", vp, uio, ioflag, cred); printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n", dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; thisvp = vp; break; case VDIR: return EISDIR; default: panic("msdosfs_write(): bad file type"); } /* * This is needed (unlike in ffs_write()) because we extend the * file outside of the loop but we don't want to extend the file * for writes of 0 bytes. */ if (uio->uio_resid == 0) return (0); /* * The caller is supposed to ensure that * uio->uio_offset >= 0 and uio->uio_resid >= 0. */ if ((uoff_t)uio->uio_offset + uio->uio_resid > DOS_FILESIZE_MAX) return (EFBIG); /* * If they've exceeded their filesize limit, tell them about it. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); /* * If the offset we are starting the write at is beyond the end of * the file, then they've done a seek. Unix filesystems allow * files with holes in them, DOS doesn't so we must fill the hole * with zeroed blocks. */ if (uio->uio_offset > dep->de_FileSize) { error = deextend(dep, uio->uio_offset, cred); if (error) return (error); } /* * Remember some values in case the write fails. */ resid = uio->uio_resid; osize = dep->de_FileSize; /* * If we write beyond the end of the file, extend it to its ultimate * size ahead of the time to hopefully get a contiguous area. */ if (uio->uio_offset + resid > osize) { count = de_clcount(pmp, uio->uio_offset + resid) - de_clcount(pmp, osize); error = extendfile(dep, count, NULL, NULL, 0); if (error && (error != ENOSPC || (ioflag & IO_UNIT))) goto errexit; lastcn = dep->de_fc[FC_LASTFC].fc_frcn; } else lastcn = de_clcount(pmp, osize) - 1; seqcount = ioflag >> IO_SEQSHIFT; do { if (de_cluster(pmp, uio->uio_offset) > lastcn) { error = ENOSPC; break; } croffset = uio->uio_offset & pmp->pm_crbomask; n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); if (uio->uio_offset + n > dep->de_FileSize) { dep->de_FileSize = uio->uio_offset + n; /* The object size needs to be set before buffer is allocated */ vnode_pager_setsize(vp, dep->de_FileSize); } bn = de_cluster(pmp, uio->uio_offset); if ((uio->uio_offset & pmp->pm_crbomask) == 0 && (de_cluster(pmp, uio->uio_offset + uio->uio_resid) > de_cluster(pmp, uio->uio_offset) || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { /* * If either the whole cluster gets written, * or we write the cluster from its start beyond EOF, * then no need to read data from disk. 
*/ bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0, 0); + /* + * This call to vfs_bio_clrbuf() ensures that + * even if vn_io_fault_uiomove() below faults, + * garbage from the newly instantiated buffer + * is not exposed to the userspace via mmap(). + */ vfs_bio_clrbuf(bp); /* * Do the bmap now, since pcbmap needs buffers * for the fat table. (see msdosfs_strategy) */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &bn, 0, 0); if (error) bp->b_blkno = -1; else bp->b_blkno = bn; } if (bp->b_blkno == -1) { brelse(bp); if (!error) error = EIO; /* XXX */ break; } } else { /* * The block we need to write into exists, so read it in. */ error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp); if (error) { brelse(bp); break; } } /* * Should these vnode_pager_* functions be done on dir * files? */ /* * Copy the data from user space into the buf header. */ - error = uiomove(bp->b_data + croffset, n, uio); + error = vn_io_fault_uiomove(bp->b_data + croffset, n, uio); if (error) { brelse(bp); break; } /* Prepare for clustered writes in some else clauses. */ if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; /* * If IO_SYNC, then each buffer is written synchronously. * Otherwise, if we have a severe page deficiency then * write the buffer asynchronously. Otherwise, if on a * cluster boundary then write the buffer asynchronously, * combining it with contiguous clusters if permitted and * possible, since we don't expect more writes into this * buffer soon. Otherwise, do a delayed write because we * expect more writes into this buffer soon. */ if (ioflag & IO_SYNC) (void)bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else if (n + croffset == pmp->pm_bpcluster) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) cluster_write(vp, bp, dep->de_FileSize, seqcount, 0); else bawrite(bp); } else bdwrite(bp); dep->de_flag |= DE_UPDATE; } while (error == 0 && uio->uio_resid > 0); /* * If the write failed and they want us to, truncate the file back * to the size it was before the write was attempted. */ errexit: if (error) { if (ioflag & IO_UNIT) { detrunc(dep, osize, ioflag & IO_SYNC, NOCRED); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED); if (uio->uio_resid != resid) error = 0; } } else if (ioflag & IO_SYNC) error = deupdat(dep, 1); return (error); } /* * Flush the blocks of a file to disk. */ static int msdosfs_fsync(struct vop_fsync_args *ap) { struct vnode *devvp; int allerror, error; vop_stdfsync(ap); /* * If the syncing request comes from fsync(2), sync the entire * FAT and any other metadata that happens to be on devvp. We * need this mainly for the FAT. We write the FAT sloppily, and * syncing it all now is the best we can easily do to get all * directory entries associated with the file (not just the file) * fully synced. The other metadata includes critical metadata * for all directory entries, but only in the MNT_ASYNC case. We * will soon sync all metadata in the file's directory entry. * Non-critical metadata for associated directory entries only * gets synced accidentally, as in most file systems. 
*/ if (ap->a_waitfor == MNT_WAIT) { devvp = VTODE(ap->a_vp)->de_pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); allerror = VOP_FSYNC(devvp, MNT_WAIT, ap->a_td); VOP_UNLOCK(devvp, 0); } else allerror = 0; error = deupdat(VTODE(ap->a_vp), ap->a_waitfor == MNT_WAIT); if (allerror == 0) allerror = error; return (allerror); } static int msdosfs_remove(struct vop_remove_args *ap) { struct denode *dep = VTODE(ap->a_vp); struct denode *ddep = VTODE(ap->a_dvp); int error; if (ap->a_vp->v_type == VDIR) error = EPERM; else error = removede(ddep, dep); #ifdef MSDOSFS_DEBUG printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount); #endif return (error); } /* * DOS filesystems don't know what links are. */ static int msdosfs_link(struct vop_link_args *ap) { return (EOPNOTSUPP); } /* * Renames on files require moving the denode to a new hash queue since the * denode's location is used to compute which hash queue to put the file * in. Unless it is a rename in place. For example "mv a b". * * What follows is the basic algorithm: * * if (file move) { * if (dest file exists) { * remove dest file * } * if (dest and src in same directory) { * rewrite name in existing directory slot * } else { * write new entry in dest directory * update offset and dirclust in denode * move denode to new hash chain * clear old directory entry * } * } else { * directory move * if (dest directory exists) { * if (dest is not empty) { * return ENOTEMPTY * } * remove dest directory * } * if (dest and src in same directory) { * rewrite name in existing entry * } else { * be sure dest is not a child of src directory * write entry in dest directory * update "." and ".." in moved directory * clear old directory entry for moved directory * } * } * * On entry: * source's parent directory is unlocked * source file or directory is unlocked * destination's parent directory is locked * destination file or directory is locked if it exists * * On exit: * all denodes should be released */ static int msdosfs_rename(struct vop_rename_args *ap) { struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct denode *ip, *xp, *dp, *zp; u_char toname[12], oldname[11]; u_long from_diroffset, to_diroffset; u_char to_count; int doingdirectory = 0, newparent = 0; int error; u_long cn, pcl; daddr_t bn; struct msdosfsmount *pmp; struct direntry *dotdotp; struct buf *bp; pmp = VFSTOMSDOSFS(fdvp->v_mount); #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("msdosfs_rename: no name"); #endif /* * Check for cross-device rename. */ if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } /* * If source and dest are the same, do nothing. */ if (tvp == fvp) { error = 0; goto abortit; } error = vn_lock(fvp, LK_EXCLUSIVE); if (error) goto abortit; dp = VTODE(fdvp); ip = VTODE(fvp); /* * Be sure we are not renaming ".", "..", or an alias of ".". This * leads to a crippled directory tree. It's pretty tough to do a * "ls" or "pwd" with the "." directory entry missing, and "cd .." * doesn't work if the ".." entry is missing. */ if (ip->de_Attributes & ATTR_DIRECTORY) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. 
*/ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) || (ip->de_flag & DE_RENAME)) { VOP_UNLOCK(fvp, 0); error = EINVAL; goto abortit; } ip->de_flag |= DE_RENAME; doingdirectory++; } /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; /* * Remember direntry place to use for destination */ to_diroffset = dp->de_fndoffset; to_count = dp->de_fndcnt; /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to doscheckpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp, 0); if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster) newparent = 1; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); /* * doscheckpath() vput()'s dp, * so we have to do a relookup afterwards */ error = doscheckpath(ip, dp); if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost to startdir"); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; dp = VTODE(tdvp); xp = tvp ? VTODE(tvp) : NULL; } if (xp != NULL) { /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if (xp->de_Attributes & ATTR_DIRECTORY) { if (!dosdirempty(xp)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = removede(dp, xp); if (error) goto bad; vput(tvp); xp = NULL; } /* * Convert the filename in tcnp into a dos filename. We copy this * into the denode and directory entry for the destination * file/directory. */ error = uniqdosname(VTODE(tdvp), tcnp, toname); if (error) goto abortit; /* * Since from wasn't locked at various places above, * have to do a relookup here. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("msdosfs_rename: lost from startdir"); if (!newparent) VOP_UNLOCK(tdvp, 0); if (relookup(fdvp, &fvp, fcnp) == 0) vrele(fdvp); if (fvp == NULL) { /* * From name has disappeared. */ if (doingdirectory) panic("rename: lost dir entry"); if (newparent) VOP_UNLOCK(tdvp, 0); vrele(tdvp); vrele(ap->a_fvp); return 0; } xp = VTODE(fvp); zp = VTODE(fdvp); from_diroffset = zp->de_fndoffset; /* * Ensure that the directory entry still exists and has not * changed till now. If the source is a file the entry may * have been unlinked or renamed. In either case there is * no further work to be done. If the source is a directory * then it cannot have been rmdir'ed or renamed; this is * prohibited by the DE_RENAME flag. */ if (xp != ip) { if (doingdirectory) panic("rename: lost dir entry"); VOP_UNLOCK(fvp, 0); if (newparent) VOP_UNLOCK(fdvp, 0); vrele(ap->a_fvp); xp = NULL; } else { vrele(fvp); xp = NULL; /* * First write a new entry in the destination * directory and mark the entry in the source directory * as deleted. Then move the denode to the correct hash * chain for its new location in the filesystem. 
And, if * we moved a directory, then update its .. entry to point * to the new parent directory. */ bcopy(ip->de_Name, oldname, 11); bcopy(toname, ip->de_Name, 11); /* update denode */ dp->de_fndoffset = to_diroffset; dp->de_fndcnt = to_count; error = createde(ip, dp, (struct denode **)0, tcnp); if (error) { bcopy(oldname, ip->de_Name, 11); if (newparent) VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); goto bad; } /* * If ip is for a directory, then its name should always * be "." since it is for the directory entry in the * directory itself (msdosfs_lookup() always translates * to the "." entry so as to get a unique denode, except * for the root directory there are different * complications). However, we just corrupted its name * to pass the correct name to createde(). Undo this. */ if ((ip->de_Attributes & ATTR_DIRECTORY) != 0) bcopy(oldname, ip->de_Name, 11); ip->de_refcnt++; zp->de_fndoffset = from_diroffset; error = removede(zp, ip); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); goto bad; } if (!doingdirectory) { error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0, &ip->de_dirclust, 0); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ if (newparent) VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); goto bad; } if (ip->de_dirclust == MSDOSFSROOT) ip->de_diroffset = to_diroffset; else ip->de_diroffset = to_diroffset & pmp->pm_crbomask; } reinsert(ip); if (newparent) VOP_UNLOCK(fdvp, 0); } /* * If we moved a directory to a new parent directory, then we must * fixup the ".." entry in the moved directory. */ if (doingdirectory && newparent) { cn = ip->de_StartCluster; if (cn == MSDOSFSROOT) { /* this should never happen */ panic("msdosfs_rename(): updating .. in root directory?"); } else bn = cntobn(pmp, cn); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); if (error) { /* XXX should downgrade to ro here, fs is corrupt */ brelse(bp); VOP_UNLOCK(fvp, 0); goto bad; } dotdotp = (struct direntry *)bp->b_data + 1; pcl = dp->de_StartCluster; if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = MSDOSFSROOT; putushort(dotdotp->deStartCluster, pcl); if (FAT32(pmp)) putushort(dotdotp->deHighClust, pcl >> 16); if (DOINGASYNC(fvp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) { /* XXX should downgrade to ro here, fs is corrupt */ VOP_UNLOCK(fvp, 0); goto bad; } } /* * The msdosfs lookup is case insensitive. Several aliases may * be inserted for a single directory entry. As a consequnce, * name cache purge done by lookup for fvp when DELETE op for * namei is specified, might be not enough to expunge all * namecache entries that were installed for this direntry. */ cache_purge(fvp); VOP_UNLOCK(fvp, 0); bad: if (xp) vput(tvp); vput(tdvp); out: ip->de_flag &= ~DE_RENAME; vrele(fdvp); vrele(fvp); return (error); } static struct { struct direntry dot; struct direntry dotdot; } dosdirtemplate = { { ". ", /* the . entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ }, { ".. ", /* the .. 
entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ } }; static int msdosfs_mkdir(struct vop_mkdir_args *ap) { struct componentname *cnp = ap->a_cnp; struct denode *dep; struct denode *pdep = VTODE(ap->a_dvp); struct direntry *denp; struct msdosfsmount *pmp = pdep->de_pmp; struct buf *bp; u_long newcluster, pcl; int bn; int error; struct denode ndirent; struct timespec ts; /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad2; } /* * Allocate a cluster to hold the about to be created directory. */ error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL); if (error) goto bad2; bzero(&ndirent, sizeof(ndirent)); ndirent.de_pmp = pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; getnanotime(&ts); DETIMES(&ndirent, &ts, &ts, &ts); /* * Now fill the cluster with the "." and ".." entries. And write * the cluster to disk. This way it is there for the parent * directory to be pointing at if there were a crash. */ bn = cntobn(pmp, newcluster); /* always succeeds */ bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0); bzero(bp->b_data, pmp->pm_bpcluster); bcopy(&dosdirtemplate, bp->b_data, sizeof dosdirtemplate); denp = (struct direntry *)bp->b_data; putushort(denp[0].deStartCluster, newcluster); putushort(denp[0].deCDate, ndirent.de_CDate); putushort(denp[0].deCTime, ndirent.de_CTime); denp[0].deCHundredth = ndirent.de_CHun; putushort(denp[0].deADate, ndirent.de_ADate); putushort(denp[0].deMDate, ndirent.de_MDate); putushort(denp[0].deMTime, ndirent.de_MTime); pcl = pdep->de_StartCluster; /* * Although the root directory has a non-magic starting cluster * number for FAT32, chkdsk and fsck_msdosfs still require * references to it in dotdot entries to be magic. */ if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = MSDOSFSROOT; putushort(denp[1].deStartCluster, pcl); putushort(denp[1].deCDate, ndirent.de_CDate); putushort(denp[1].deCTime, ndirent.de_CTime); denp[1].deCHundredth = ndirent.de_CHun; putushort(denp[1].deADate, ndirent.de_ADate); putushort(denp[1].deMDate, ndirent.de_MDate); putushort(denp[1].deMTime, ndirent.de_MTime); if (FAT32(pmp)) { putushort(denp[0].deHighClust, newcluster >> 16); putushort(denp[1].deHighClust, pcl >> 16); } if (DOINGASYNC(ap->a_dvp)) bdwrite(bp); else if ((error = bwrite(bp)) != 0) goto bad; /* * Now build up a directory entry pointing to the newly allocated * cluster. This will be written to an empty slot in the parent * directory. 
*/ #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("msdosfs_mkdir: no name"); #endif error = uniqdosname(pdep, cnp, ndirent.de_Name); if (error) goto bad; ndirent.de_Attributes = ATTR_DIRECTORY; ndirent.de_LowerCase = 0; ndirent.de_StartCluster = newcluster; ndirent.de_FileSize = 0; error = createde(&ndirent, pdep, &dep, cnp); if (error) goto bad; *ap->a_vpp = DETOV(dep); return (0); bad: clusterfree(pmp, newcluster, NULL); bad2: return (error); } static int msdosfs_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct denode *ip, *dp; int error; ip = VTODE(vp); dp = VTODE(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) { error = ENOTEMPTY; goto out; } /* * Delete the entry from the directory. For dos filesystems this * gets rid of the directory entry on disk, the in memory copy * still exists but the de_refcnt is <= 0. This prevents it from * being found by deget(). When the vput() on dep is done we give * up access and eventually msdosfs_reclaim() will be called which * will remove it from the denode cache. */ error = removede(dp, ip); if (error) goto out; /* * This is where we decrement the link count in the parent * directory. Since dos filesystems don't do this we just purge * the name cache. */ cache_purge(dvp); /* * Truncate the directory that is being deleted. */ error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred); cache_purge(vp); out: return (error); } /* * DOS filesystems don't know what symlinks are. */ static int msdosfs_symlink(struct vop_symlink_args *ap) { return (EOPNOTSUPP); } static int msdosfs_readdir(struct vop_readdir_args *ap) { struct mbnambuf nb; int error = 0; int diff; long n; int blsize; long on; u_long cn; uint64_t fileno; u_long dirsperblk; long bias = 0; daddr_t bn, lbn; struct buf *bp; struct denode *dep = VTODE(ap->a_vp); struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; struct dirent dirbuf; struct uio *uio = ap->a_uio; u_long *cookies = NULL; int ncookies = 0; off_t offset, off; int chksum = -1; #ifdef MSDOSFS_DEBUG printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n", ap->a_vp, uio, ap->a_cred, ap->a_eofflag); #endif /* * msdosfs_readdir() won't operate properly on regular files since * it does i/o only with the filesystem vnode, and hence can * retrieve the wrong block from the buffer cache for a plain file. * So, fail attempts to readdir() on a plain file. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) return (ENOTDIR); /* * To be safe, initialize dirbuf */ bzero(dirbuf.d_name, sizeof(dirbuf.d_name)); /* * If the user buffer is smaller than the size of one dos directory * entry or the file offset is not a multiple of the size of a * directory entry, then we fail the read. */ off = offset = uio->uio_offset; if (uio->uio_resid < sizeof(struct direntry) || (offset & (sizeof(struct direntry) - 1))) return (EINVAL); if (ap->a_ncookies) { ncookies = uio->uio_resid / 16; cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; *ap->a_ncookies = ncookies; } dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry); /* * If they are reading from the root directory then, we simulate * the . and .. entries since these don't exist in the root * directory. 
We also set the offset bias to make up for having to * simulate these entries. By this I mean that at file offset 64 we * read the first entry in the root directory that lives on disk. */ if (dep->de_StartCluster == MSDOSFSROOT || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) { #if 0 printf("msdosfs_readdir(): going after . or .. in root dir, offset %d\n", offset); #endif bias = 2 * sizeof(struct direntry); if (offset < bias) { for (n = (int)offset / sizeof(struct direntry); n < 2; n++) { if (FAT32(pmp)) fileno = (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; if (pmp->pm_flags & MSDOSFS_LARGEFS) { dirbuf.d_fileno = msdosfs_fileno_map(pmp->pm_mountp, fileno); } else { dirbuf.d_fileno = (uint32_t)fileno; } dirbuf.d_type = DT_DIR; switch (n) { case 0: dirbuf.d_namlen = 1; strcpy(dirbuf.d_name, "."); break; case 1: dirbuf.d_namlen = 2; strcpy(dirbuf.d_name, ".."); break; } dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) goto out; error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) goto out; offset += sizeof(struct direntry); off = offset; if (cookies) { *cookies++ = offset; if (--ncookies <= 0) goto out; } } } } mbnambuf_init(&nb); off = offset; while (uio->uio_resid > 0) { lbn = de_cluster(pmp, offset - bias); on = (offset - bias) & pmp->pm_crbomask; n = min(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); if (diff <= 0) break; n = min(n, diff); error = pcbmap(dep, lbn, &bn, &cn, &blsize); if (error) break; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } n = min(n, blsize - bp->b_resid); if (n == 0) { brelse(bp); return (EIO); } /* * Convert from dos directory entries to fs-independent * directory entries. */ for (dentp = (struct direntry *)(bp->b_data + on); (char *)dentp < bp->b_data + on + n; dentp++, offset += sizeof(struct direntry)) { #if 0 printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n", dentp, prev, crnt, dentp->deName[0], dentp->deAttributes); #endif /* * If this is an unused entry, we can stop. */ if (dentp->deName[0] == SLOT_EMPTY) { brelse(bp); goto out; } /* * Skip deleted entries. */ if (dentp->deName[0] == SLOT_DELETED) { chksum = -1; mbnambuf_init(&nb); continue; } /* * Handle Win95 long directory entries */ if (dentp->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = win2unixfn(&nb, (struct winentry *)dentp, chksum, pmp); continue; } /* * Skip volume labels */ if (dentp->deAttributes & ATTR_VOLUME) { chksum = -1; mbnambuf_init(&nb); continue; } /* * This computation of d_fileno must match * the computation of va_fileid in * msdosfs_getattr. */ if (dentp->deAttributes & ATTR_DIRECTORY) { fileno = getushort(dentp->deStartCluster); if (FAT32(pmp)) fileno |= getushort(dentp->deHighClust) << 16; /* if this is the root directory */ if (fileno == MSDOSFSROOT) if (FAT32(pmp)) fileno = (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) * dirsperblk; else fileno = 1; else fileno = (uint64_t)cntobn(pmp, fileno) * dirsperblk; dirbuf.d_type = DT_DIR; } else { fileno = (uoff_t)offset / sizeof(struct direntry); dirbuf.d_type = DT_REG; } if (pmp->pm_flags & MSDOSFS_LARGEFS) { dirbuf.d_fileno = msdosfs_fileno_map(pmp->pm_mountp, fileno); } else dirbuf.d_fileno = (uint32_t)fileno; if (chksum != winChksum(dentp->deName)) { dirbuf.d_namlen = dos2unixfn(dentp->deName, (u_char *)dirbuf.d_name, dentp->deLowerCase | ((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ? 
(LCASE_BASE | LCASE_EXT) : 0), pmp); mbnambuf_init(&nb); } else mbnambuf_flush(&nb, &dirbuf); chksum = -1; dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf); if (uio->uio_resid < dirbuf.d_reclen) { brelse(bp); goto out; } error = uiomove(&dirbuf, dirbuf.d_reclen, uio); if (error) { brelse(bp); goto out; } if (cookies) { *cookies++ = offset + sizeof(struct direntry); if (--ncookies <= 0) { brelse(bp); goto out; } } off = offset + sizeof(struct direntry); } brelse(bp); } out: /* Subtract unused cookies */ if (ap->a_ncookies) *ap->a_ncookies -= ncookies; uio->uio_offset = off; /* * Set the eofflag (NFS uses it) */ if (ap->a_eofflag) { if (dep->de_FileSize - (offset - bias) <= 0) *ap->a_eofflag = 1; else *ap->a_eofflag = 0; } return (error); } /*- * a_vp - pointer to the file's vnode * a_bn - logical block number within the file (cluster number for us) * a_bop - where to return the bufobj of the special file containing the fs * a_bnp - where to return the "physical" block number corresponding to a_bn * (relative to the special file; units are blocks of size DEV_BSIZE) * a_runp - where to return the "run past" a_bn. This is the count of logical * blocks whose physical blocks (together with a_bn's physical block) * are contiguous. * a_runb - where to return the "run before" a_bn. */ static int msdosfs_bmap(struct vop_bmap_args *ap) { struct denode *dep; struct mount *mp; struct msdosfsmount *pmp; struct vnode *vp; daddr_t runbn; u_long cn; int bnpercn, error, maxio, maxrun, run; vp = ap->a_vp; dep = VTODE(vp); pmp = dep->de_pmp; if (ap->a_bop != NULL) *ap->a_bop = &pmp->pm_devvp->v_bufobj; if (ap->a_bnp == NULL) return (0); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; cn = ap->a_bn; if (cn != ap->a_bn) return (EFBIG); error = pcbmap(dep, cn, ap->a_bnp, NULL, NULL); if (error != 0 || (ap->a_runp == NULL && ap->a_runb == NULL)) return (error); mp = vp->v_mount; maxio = mp->mnt_iosize_max / mp->mnt_stat.f_iosize; bnpercn = de_cn2bn(pmp, 1); if (ap->a_runp != NULL) { maxrun = ulmin(maxio - 1, pmp->pm_maxcluster - cn); for (run = 1; run <= maxrun; run++) { if (pcbmap(dep, cn + run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp + run * bnpercn) break; } *ap->a_runp = run - 1; } if (ap->a_runb != NULL) { maxrun = ulmin(maxio - 1, cn); for (run = 1; run < maxrun; run++) { if (pcbmap(dep, cn - run, &runbn, NULL, NULL) != 0 || runbn != *ap->a_bnp - run * bnpercn) break; } *ap->a_runb = run - 1; } return (0); } +SYSCTL_NODE(_vfs, OID_AUTO, msdosfs, CTLFLAG_RW, 0, "msdos filesystem"); +static int use_buf_pager = 1; +SYSCTL_INT(_vfs_msdosfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, + &use_buf_pager, 0, + "Use buffer pager instead of bmap"); + +static daddr_t +msdosfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) +{ + + return (de_cluster(VTODE(vp)->de_pmp, off)); +} + static int +msdosfs_gbp_getblksz(struct vnode *vp, daddr_t lbn) +{ + + return (VTODE(vp)->de_pmp->pm_bpcluster); +} + +static int +msdosfs_getpages(struct vop_getpages_args *ap) +{ + + if (use_buf_pager) + return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, msdosfs_gbp_getblkno, + msdosfs_gbp_getblksz)); + return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, NULL, NULL)); +} + +static int msdosfs_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct denode *dep = VTODE(ap->a_vp); struct bufobj *bo; int error = 0; daddr_t blkno; /* * If we don't already know the filesystem relative block number * then get it 
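/*
 * Illustrative note (not part of the original source): because the
 * knob declared above uses CTLFLAG_RWTUN, it can be set either at
 * runtime or as a loader tunable, e.g.
 *
 *	sysctl vfs.msdosfs.use_buf_pager=0
 *
 * which makes msdosfs_getpages() fall back to the bmap-based
 * vnode_pager_generic_getpages() path instead of vfs_bio_getpages().
 */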
using pcbmap(). If pcbmap() returns the block * number as -1 then we've got a hole in the file. DOS filesystems * don't allow files with holes, so we shouldn't ever see this. */ if (bp->b_blkno == bp->b_lblkno) { error = pcbmap(dep, bp->b_lblkno, &blkno, 0, 0); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if (bp->b_blkno == -1) { bufdone(bp); return (0); } /* * Read/write the block from/to the disk that contains the desired * file block. */ bp->b_iooffset = dbtob(bp->b_blkno); bo = dep->de_pmp->pm_bo; BO_STRATEGY(bo, bp); return (0); } static int msdosfs_print(struct vop_print_args *ap) { struct denode *dep = VTODE(ap->a_vp); printf("\tstartcluster %lu, dircluster %lu, diroffset %lu, ", dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset); printf("on dev %s\n", devtoname(dep->de_pmp->pm_dev)); return (0); } static int msdosfs_pathconf(struct vop_pathconf_args *ap) { struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = 1; return (0); case _PC_NAME_MAX: *ap->a_retval = pmp->pm_flags & MSDOSFSMNT_LONGNAME ? WIN_MAXLEN : 12; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } static int msdosfs_vptofh(struct vop_vptofh_args *ap) { struct denode *dep; struct defid *defhp; dep = VTODE(ap->a_vp); defhp = (struct defid *)ap->a_fhp; defhp->defid_len = sizeof(struct defid); defhp->defid_dirclust = dep->de_dirclust; defhp->defid_dirofs = dep->de_diroffset; /* defhp->defid_gen = dep->de_gen; */ return (0); } /* Global vfs data structures for msdosfs */ struct vop_vector msdosfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = msdosfs_access, .vop_bmap = msdosfs_bmap, + .vop_getpages = msdosfs_getpages, .vop_cachedlookup = msdosfs_lookup, .vop_open = msdosfs_open, .vop_close = msdosfs_close, .vop_create = msdosfs_create, .vop_fsync = msdosfs_fsync, .vop_fdatasync = vop_stdfdatasync_buf, .vop_getattr = msdosfs_getattr, .vop_inactive = msdosfs_inactive, .vop_link = msdosfs_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = msdosfs_mkdir, .vop_mknod = msdosfs_mknod, .vop_pathconf = msdosfs_pathconf, .vop_print = msdosfs_print, .vop_read = msdosfs_read, .vop_readdir = msdosfs_readdir, .vop_reclaim = msdosfs_reclaim, .vop_remove = msdosfs_remove, .vop_rename = msdosfs_rename, .vop_rmdir = msdosfs_rmdir, .vop_setattr = msdosfs_setattr, .vop_strategy = msdosfs_strategy, .vop_symlink = msdosfs_symlink, .vop_write = msdosfs_write, .vop_vptofh = msdosfs_vptofh, }; Index: user/alc/PQ_LAUNDRY/sys/i386/i386/mem.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/i386/i386/mem.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/i386/i386/mem.c (revision 308054) @@ -1,229 +1,234 @@ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1982, 1986, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and code derived from software contributed to * Berkeley by William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: mem.c 1.13 89/10/08$ * from: @(#)mem.c 7.2 (Berkeley) 5/9/91 */ #include __FBSDID("$FreeBSD$"); /* * Memory special file */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Used in /dev/mem drivers and elsewhere */ MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors"); static struct sx memsxlock; SX_SYSINIT(memsxlockinit, &memsxlock, "/dev/mem lock"); /* ARGSUSED */ int memrw(struct cdev *dev, struct uio *uio, int flags) { int o; u_int c = 0; vm_paddr_t pa; struct iovec *iov; int error = 0; vm_offset_t addr; if (dev2unit(dev) != CDEV_MINOR_MEM && dev2unit(dev) != CDEV_MINOR_KMEM) return EIO; if (dev2unit(dev) == CDEV_MINOR_KMEM && uio->uio_resid > 0) { if (uio->uio_offset < (vm_offset_t)VADDR(PTDPTDI, 0)) return (EFAULT); if (!kernacc((caddr_t)(int)uio->uio_offset, uio->uio_resid, uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE)) return (EFAULT); } while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("memrw"); continue; } if (dev2unit(dev) == CDEV_MINOR_MEM) { - pa = uio->uio_offset; - pa &= ~PAGE_MASK; + if (uio->uio_offset > cpu_getmaxphyaddr()) { + error = EFAULT; + break; + } + pa = trunc_page(uio->uio_offset); } else { /* * Extract the physical page since the mapping may * change at any time. This avoids panics on page * fault in this case but will cause reading/writing * to the wrong page. * Hopefully an application will notice the wrong * data on read access and refrain from writing. * This should be replaced by a special uiomove * type function that just returns an error if there * is a page fault on a kernel page. */ addr = trunc_page(uio->uio_offset); pa = pmap_extract(kernel_pmap, addr); if (pa == 0) return EFAULT; } /* * XXX UPS This should just use sf_buf_alloc. * Unfortunately sf_buf_alloc needs a vm_page * and we may want to look at memory not covered * by the page array. 
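	 *
	 * As a worked example of the arithmetic below (illustrative values
	 * only): with uio_offset == 0x1ff0 and an 8 KB iovec, o == 0xff0
	 * and c == PAGE_SIZE - o == 16, so only the 16-byte tail of the
	 * first page is copied through ptvmmap before the loop advances
	 * to the next page of the request.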
*/ sx_xlock(&memsxlock); pmap_kenter((vm_offset_t)ptvmmap, pa); pmap_invalidate_page(kernel_pmap,(vm_offset_t)ptvmmap); o = (int)uio->uio_offset & PAGE_MASK; c = PAGE_SIZE - o; c = min(c, (u_int)iov->iov_len); error = uiomove((caddr_t)&ptvmmap[o], (int)c, uio); pmap_qremove((vm_offset_t)ptvmmap, 1); sx_xunlock(&memsxlock); } return (error); } /* * allow user processes to MMAP some memory sections * instead of going through read/write */ /* ARGSUSED */ int memmmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int prot __unused, vm_memattr_t *memattr __unused) { if (dev2unit(dev) == CDEV_MINOR_MEM) { + if (offset > cpu_getmaxphyaddr()) + return (-1); *paddr = offset; return (0); } return (-1); } /* * Operations for changing memory attributes. * * This is basically just an ioctl shim for mem_range_attr_get * and mem_range_attr_set. */ /* ARGSUSED */ int memioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int flags, struct thread *td) { int nd, error = 0; struct mem_range_op *mo = (struct mem_range_op *)data; struct mem_range_desc *md; /* is this for us? */ if ((cmd != MEMRANGE_GET) && (cmd != MEMRANGE_SET)) return (ENOTTY); /* any chance we can handle this? */ if (mem_range_softc.mr_op == NULL) return (EOPNOTSUPP); /* do we have any descriptors? */ if (mem_range_softc.mr_ndesc == 0) return (ENXIO); switch (cmd) { case MEMRANGE_GET: nd = imin(mo->mo_arg[0], mem_range_softc.mr_ndesc); if (nd > 0) { md = (struct mem_range_desc *) malloc(nd * sizeof(struct mem_range_desc), M_MEMDESC, M_WAITOK); error = mem_range_attr_get(md, &nd); if (!error) error = copyout(md, mo->mo_desc, nd * sizeof(struct mem_range_desc)); free(md, M_MEMDESC); } else nd = mem_range_softc.mr_ndesc; mo->mo_arg[0] = nd; break; case MEMRANGE_SET: md = (struct mem_range_desc *)malloc(sizeof(struct mem_range_desc), M_MEMDESC, M_WAITOK); error = copyin(mo->mo_desc, md, sizeof(struct mem_range_desc)); /* clamp description string */ md->mr_owner[sizeof(md->mr_owner) - 1] = 0; if (error == 0) error = mem_range_attr_set(md, &mo->mo_arg[0]); free(md, M_MEMDESC); break; } return (error); } Index: user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c (revision 308054) @@ -1,5616 +1,5628 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. 
This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; extern u_int32_t KERNend; extern u_int32_t KPTphys; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. 
console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ struct sysmaps { struct mtx lock; pt_entry_t *CMAP1; pt_entry_t *CMAP2; caddr_t CADDR1; caddr_t CADDR2; }; static struct sysmaps sysmaps_pcpu[MAXCPU]; pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; struct msgbuf *msgbufp = NULL; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2; static pt_entry_t *PADDR1 = NULL, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int 
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait); #endif static void pmap_set_pg(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct sysmaps *sysmaps; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). 
Otherwise, * pmap_update_pde_kernel() could access allpmaps while it is * being changed. */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ for (i = 0; i < MAXCPU; i++) { sysmaps = &sysmaps_pcpu[i]; mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) } SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; /* * Adjust the start of the KPTD and KPTmap so that the implementation * of pmap_kextract() and pmap_growkernel() can be made simpler. */ KPTD -= KPTDI; KPTmap -= i386_btop(KPTDI << PDRSHIFT); /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } static void pmap_init_qpages(void) { struct pcpu *pc; int i; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL); /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. 
* * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ static void pmap_set_pg(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag == 0) return; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir_pde(PTD, va) |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. 
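 *
 * A minimal sketch of the freelist idea implemented by the two routines
 * below (simplified; the real code also panics if a va shows up with
 * PG_V set):
 *
 *	alloc:  va = *head;  *head = *vtopte(va);  *vtopte(va) = 0;
 *	free:   *vtopte(va) = *head;  *head = va;
 *
 * Each free KVA page's PTE slot stores the next free virtual address,
 * and PG_V stays clear because page-aligned addresses have a zero low bit.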
*/ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. 
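	 *
	 * For a rough sense of scale (illustrative, assuming 4 MB
	 * superpages on a non-PAE kernel): if the last physical segment
	 * ends at 1 GB, pv_npg comes to about 1 GB / NBPDR == 256, so the
	 * table below costs 256 entries of sizeof(struct md_page), rounded
	 * up to a whole page; pa_to_pvh() then indexes it by
	 * (pa >> PDRSHIFT).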
*/ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pmap_t pmap; boolean_t PTD_updated; PTD_updated = FALSE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) PTD_updated = TRUE; pde = pmap_pde(pmap, va); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); KASSERT(PTD_updated, ("pmap_kenter_pde: current page table is not in allpmaps")); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. 
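 *
 * A sketch of how callers use it (cf. pmap_update_pde() below, which in
 * the SMP case brackets the update with sched_pin()/sched_unpin()):
 *
 *	sched_pin();
 *	pde_store(pde, newpde);
 *	pmap_update_pde_invalidate(va, newpde);
 *	sched_unpin();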
*/ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } } void invltlb_glob(void) { uint64_t cr4; if (pgeflag == 0) { invltlb(); } else { cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); load_cr4(cr4 | CR4_PGE); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. 
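 *
 * In practice that means the page table store comes first and the
 * shootdown second, e.g. (sketch):
 *
 *	pte_store(pte, newpte);
 *	pmap_invalidate_page(pmap, va);
 *
 * so that no other processor can re-cache the stale entry once the
 * invalidation has been requested.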
*/ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb_glob(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { /* * Elsewhere, this operation requires allpmaps_lock for * synchronization. Here, it does not because it is being * performed in the context of an all_cpus rendezvous. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, act->va); pde_store(pde, act->newpde); } } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. 
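 *
 * For example, a demotion site that has already built the replacement
 * 4KB page table would switch the mapping with (sketch; "newpde" has
 * PG_PS clear):
 *
 *	pmap_update_pde(pmap, va, pde, newpde);
 *
 * rather than storing to *pde directly, so every processor that might
 * still hold the old 2/4MB entry takes part in the invalidation.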
*/ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendevous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)cpu_clflush_line_size; } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the mfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); mfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. 
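		 *
		 * For example (with a typical 64-byte flush line), a 4 MB
		 * range would need ~65536 CLFLUSH/CLFLUSHOPT operations;
		 * since that exceeds PMAP_CLFLUSH_THRESHOLD (2 MB) the code
		 * falls back to the global wbinvd()-based path below.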
*/ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || pmap == vmspace_pmap(curthread->td_proc->p_vmspace)); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
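 *
 * Typical use (cf. the /dev/kmem path of memrw() earlier in this diff):
 * look up the physical address backing a kernel VA and treat 0 as
 * "no mapping", e.g. (sketch)
 *
 *	pa = pmap_extract(kernel_pmap, trunc_page(va));
 *	if (pa == 0)
 *		return (EFAULT);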
*/ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. 
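		 *
		 * Worked example (illustrative, 4 MB superpages): start ==
		 * 0x00500000 gives superpage_offset == 0x00100000; a
		 * candidate va of 0x10000000 is bumped to 0x10100000 so
		 * that (va & PDRMASK) == superpage_offset and full 4 MB
		 * chunks can later be mapped with PG_PS.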
*/ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | pgeflag | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pgeflag | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Looks for a page table page mapping the specified virtual address in the * specified pmap's collection of idle page table pages. Returns NULL if there * is no page table page corresponding to the specified virtual address. */ static __inline vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); } /* * Removes the specified page table page from the specified pmap's collection * of idle page table pages. 
The specified page table page must be a member of * the pmap's collection. */ static __inline void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); vm_radix_remove(&pmap->pm_root, mpte->pindex); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); /* * Since the page table directory is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
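	 *
	 * A note on the self-referential entries installed near the end of
	 * this function: pointing pm_pdir[PTDPTDI + i] at the directory
	 * page(s) themselves is what makes every PTE of this pmap visible
	 * at a fixed kernel VA once the pmap is current, roughly
	 *
	 *	pte = PTmap + i386_btop(va);
	 *
	 * (sketch of the vtopte() idea only; the real lookups are
	 * pmap_pte()/pmap_pte_quick() above).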
*/ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) if ((ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Copy the kernel page table directory entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); mtx_unlock_spin(&allpmaps_lock); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt[i] = pa | PG_V; #endif } CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. 
* Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. 
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. 
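 * (In outline, assuming the usual i386 constants: a pv chunk occupies one
 * page and holds _NPCPV == 336 pv entries, tracked by an _NPCM == 11 word
 * free bitmap -- ten full 32-bit words plus a 16-bit tail, 10 * 32 + 16 ==
 * 336, which is why pc_freemask ends with PC_FREE10 == 0x0000ffff.  The
 * loop below visits the allocated entries of each word roughly like:
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	while (inuse != 0) {
 *		bit = bsfl(inuse);			// lowest set bit
 *		pv = &pc->pc_pventry[field * 32 + bit];
 *		... unmap pv->pv_va, mark the slot free ...
 *		inuse &= ~(1UL << bit);
 *	}
 *
 * The real loop differs only in folding the clearing of "inuse" into the
 * for-statement and in skipping wired and superpage mappings.)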
*/ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. 
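 * (Keeping chunks that still have free slots at the head matters because
 * get_pv_entry() below always tries TAILQ_FIRST(&pmap->pm_pvchunk) first,
 * so the slot freed here is found again without a list search.  As a
 * worked example of the index math above: the pv at index 100 of
 * pc_pventry lives in bitmap word 100 / 32 == 3, bit 100 % 32 == 4, so
 * freeing it sets pc_map[3] |= 1ul << 4.)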
*/ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. 
*/ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. 
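 * Demotion replaces the single PDE mapping with a page table page whose
 * NPTEPG entries map the same physical range 4 KB at a time (see
 * pmap_fill_ptp() above).  One subtle step is the PAT bit: in a superpage
 * PDE the PAT selector lives at bit 12 (PG_PDE_PAT), while in a 4 KB PTE
 * it lives at bit 7 (PG_PTE_PAT, the bit that is PG_PS in a PDE).  The
 * template PTE is therefore built roughly as
 *
 *	newpte = oldpde & ~PG_PS;
 *	if ((newpte & PG_PDE_PAT) != 0)
 *		newpte ^= PG_PDE_PAT | PG_PTE_PAT;	// relocate the PAT bit
 *
 * which is a sketch of the code below, assuming the standard x86 bit
 * assignments for PG_PS and the two PAT bits.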
*/ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != NULL) pmap_remove_pt_page(pmap, mpte); else { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); pmap_invalidate_page(pmap, trunc_4mpage(va)); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (va >= KERNBASE) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. 
This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_lookup_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); pmap_remove_pt_page(pmap, mpte); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpde & PG_G) pmap_invalidate_page(kernel_pmap, sva); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_lookup_pt_page(pmap, sva); if (mpte != NULL) { pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. 
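 * (A PG_G, i.e. global, mapping is special because its TLB entry survives
 * the CR3 reload that a full TLB invalidation otherwise relies on, so the
 * code below flushes it immediately and individually with
 * pmap_invalidate_page(kernel_pmap, va); callers such as pmap_remove()
 * only schedule a full invalidation for the non-global entries they
 * touch.  The note above records why the invlpg behind that call can be
 * assumed to exist whenever PG_G is in use.)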
*/ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) 
*/ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if (oldpde & PG_MANAGED) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { if (!pde_cmpset(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. 
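 * (pdnxt above is the start of the next 2/4 MB-aligned region, computed
 * roughly as
 *
 *	pdnxt = (sva + NBPDR) & ~PDRMASK;
 *	if (pdnxt < sva)	// wrapped past the top of the address space
 *		pdnxt = eva;
 *
 * so each pass of the outer loop covers at most one page directory entry;
 * the same idiom appears in pmap_remove(), pmap_unwire() and pmap_copy().)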
*/ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. 
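 * Put differently, for the PTP to be promotable the entry at index i must
 * satisfy, roughly,
 *
 *	(pte[i] & (PG_FRAME | PG_A | PG_V)) == base + i * PAGE_SIZE
 *						    + (PG_A | PG_V)
 *
 * where "base" is the superpage-aligned frame taken from the first PTE,
 * and pte[i] must also agree with the first PTE in every PG_PTE_PROMOTE
 * attribute bit.  The loop below checks this by scanning from the last
 * entry down to the second, carrying the expected value in "pa" and
 * subtracting PAGE_SIZE at each step; it is a restatement of the code,
 * not additional logic.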
*/ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PS | newpde); else pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); - /* - * In the case that a page table page is not - * resident, we are creating it here. - */ + pde = pmap_pde(pmap, va); if (va < VM_MAXUSER_ADDRESS) { + /* + * va is for UVA. + * In the case that a page table page is not resident, + * we are creating it here. pmap_allocpte() handles + * demotion. 
+ */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } + } else { + /* + * va is for KVA, so pmap_demote_pde() will never fail + * to install a page table page. PG_V is also + * asserted by pmap_demote_pde(). + */ + KASSERT(pde != NULL && (*pde & PG_V) != 0, + ("KVA %#x invalid pde pdir %#jx", va, + (uintmax_t)pmap->pm_pdir[PTDPTDI])); + if ((*pde & PG_PS) != 0) + pmap_demote_pde(pmap, pde, va); } - - pde = pmap_pde(pmap, va); - if ((*pde & PG_PS) != 0) - panic("pmap_enter: attempted pmap_enter on 4MB page"); pte = pmap_pte_quick(pmap, va); /* - * Page Directory table entry not valid, we need a new PT page + * Page Directory table entry is not valid, which should not + * happen. We should have either allocated the page table + * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", - (uintmax_t)pmap->pm_pdir[PTDPTDI], va); + (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. 
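 * (PG_M and PG_A are masked out of "origpte" in the comparison below
 * because the MMU may set those two bits asynchronously; a PTE that
 * differs only in them is still the mapping we want and need not be
 * rewritten.  Roughly:
 *
 *	if ((origpte & ~(PG_M | PG_A)) != newpte)
 *		install newpte, deciding from the old PG_A/PG_M/PG_RW
 *		state whether a TLB invalidation is required
 *
 * which is what the block that follows does.)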
*/ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. 
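 * Where possible the loop below maps a whole 2/4 MB superpage at once:
 * roughly, when
 *
 *	(va & PDRMASK) == 0 &&		// va is superpage aligned
 *	va + NBPDR <= end &&		// the whole run fits in the range
 *	m->psind == 1 &&		// the page can back a superpage
 *	pg_ps_enabled
 *
 * pmap_enter_pde() is tried and, on success, the iterator skips ahead
 * NBPDR / PAGE_SIZE pages; otherwise each page is entered individually
 * through pmap_enter_quick_locked().  This is a restatement of the
 * conditions in the loop, not additional behaviour.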
*/ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. 
*/ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. 
*/ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; u_int ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. 
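 * (The pmap_zero_page*() and pmap_copy_page*() routines below all follow
 * the same pattern with the per-CPU sysmaps scratch slots; roughly,
 *
 *	sched_pin();			// stay on this CPU
 *	*sysmaps->CMAP2 = PG_V | PG_RW | phys | PG_A | PG_M | cache bits;
 *	invlcaddr(sysmaps->CADDR2);	// drop any stale local TLB entry
 *	... operate on the page through CADDR2 ...
 *	*sysmaps->CMAP2 = 0;
 *	sched_unpin();
 *
 * i.e. a transient kernel mapping of the target physical page that is
 * only ever used on the pinned CPU, so no cross-CPU TLB shootdown is
 * needed.)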
*/ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { #if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else #endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); pagezero(sysmaps->CADDR2); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page_area: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); if (off == 0 && size == PAGE_SIZE) pagezero(sysmaps->CADDR2); else bzero((char *)sysmaps->CADDR2 + off, size); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*sysmaps->CMAP2) panic("pmap_copy_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(sysmaps->CADDR1); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); *sysmaps->CMAP1 = 0; *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { struct sysmaps *sysmaps; vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*sysmaps->CMAP2 != 0) panic("pmap_copy_pages: CMAP2 busy"); sched_pin(); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(sysmaps->CADDR1); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); a_cp = sysmaps->CADDR1 + a_pg_offset; b_cp = sysmaps->CADDR2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *sysmaps->CMAP1 = 0; *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. 
This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
*/ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_lookup_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. 
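/*
 * [Editor's sketch, not part of the diff] pmap_remove_pages() above visits
 * every allocated pv entry in a chunk by inverting the free bitmap
 * ("~pc_map & pc_freemask") and then repeatedly extracting the lowest set
 * bit with bsfl().  The fragment below models that loop in portable C,
 * using the GCC/Clang builtin __builtin_ctz() in place of the i386 bsfl()
 * wrapper; the bitmap size and names are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define	SKETCH_NPCM	3	/* 32-bit map words per chunk (illustrative) */

static void
visit_allocated(const uint32_t pc_map[SKETCH_NPCM],
    const uint32_t pc_freemask[SKETCH_NPCM])
{
	uint32_t inuse, bitmask;
	int field, bit, idx;

	for (field = 0; field < SKETCH_NPCM; field++) {
		/* Bits clear in pc_map but valid per the free mask. */
		inuse = ~pc_map[field] & pc_freemask[field];
		while (inuse != 0) {
			bit = __builtin_ctz(inuse);	/* lowest set bit */
			bitmask = 1u << bit;
			idx = field * 32 + bit;
			inuse &= ~bitmask;
			printf("entry %d is allocated\n", idx);
		}
	}
}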
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. 
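/*
 * [Editor's sketch, not part of the diff] pmap_remove_write() above clears
 * PG_RW and PG_M with a compare-and-swap retry loop so that a concurrent
 * hardware update of PG_M or PG_A between the read and the write is never
 * lost.  The helper below expresses the same pattern with C11 atomics on a
 * plain 32-bit word; the flag values are illustrative, not the real i386
 * PTE bits.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define	SKETCH_FLAG_RW	0x2u	/* illustrative "writeable" bit */
#define	SKETCH_FLAG_M	0x40u	/* illustrative "modified" bit */

/*
 * Atomically clear the RW and M bits in *pte.  Returns true if the entry
 * was observed with the M bit set, i.e. the caller must mark the backing
 * page dirty.
 */
static bool
clear_write_access(_Atomic uint32_t *pte)
{
	uint32_t oldpte;

	oldpte = atomic_load(pte);
	for (;;) {
		if ((oldpte & SKETCH_FLAG_RW) == 0)
			return (false);
		/* On failure, oldpte is reloaded with the current value. */
		if (atomic_compare_exchange_weak(pte, &oldpte,
		    oldpte & ~(SKETCH_FLAG_RW | SKETCH_FLAG_M)))
			return ((oldpte & SKETCH_FLAG_M) != 0);
	}
}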
*/ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) continue; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. 
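/*
 * [Editor's sketch, not part of the diff] For a 2/4 MB mapping,
 * pmap_ts_referenced() above clears the shared PG_A bit on only one of the
 * 512 or 1024 constituent 4 KB pages, chosen by XOR-ing the physical page
 * number, the virtual superpage number and the pmap pointer.  The predicate
 * below restates that selection with illustrative non-PAE constants
 * (4 KB pages, 4 MB superpages, 1024 PTEs per page table): exactly one
 * page index within each superpage satisfies the test, and different
 * (page, pmap) combinations tend to pick different indices.
 */
#include <stdbool.h>
#include <stdint.h>

#define	SKETCH_PAGE_SHIFT	12
#define	SKETCH_PDRSHIFT		22
#define	SKETCH_NPTEPG		1024u

static bool
should_clear_ref_bit(uint32_t pa, uint32_t va, uintptr_t pmap_cookie)
{
	return ((((pa >> SKETCH_PAGE_SHIFT) ^ (va >> SKETCH_PDRSHIFT) ^
	    pmap_cookie) & (SKETCH_NPTEPG - 1)) == 0);
}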
*/ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else continue; if ((*pte & PG_G) != 0) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. 
Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). 
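/*
 * [Editor's sketch, not part of the diff] pmap_mapdev_attr() above accepts
 * a physical address that need not be page aligned: it keeps the offset
 * into the first page, rounds the mapping length up to whole pages, maps
 * the aligned frames, and returns the caller's original offset within the
 * first mapped page.  The arithmetic alone looks like this, with an
 * illustrative 4 KB page size and a fictitious map_frames() helper
 * standing in for the actual PTE setup.
 */
#include <stddef.h>
#include <stdint.h>

#define	SKETCH_PAGE_SIZE	4096u
#define	SKETCH_PAGE_MASK	(SKETCH_PAGE_SIZE - 1)

/* Hypothetical: map "size" bytes of page-aligned physical memory. */
extern char *map_frames(uint64_t pa, size_t size);

static void *
mapdev_sketch(uint64_t pa, size_t size)
{
	size_t offset;
	char *va;

	offset = (size_t)(pa & SKETCH_PAGE_MASK);
	size = (size + offset + SKETCH_PAGE_MASK) & ~(size_t)SKETCH_PAGE_MASK;
	pa &= ~(uint64_t)SKETCH_PAGE_MASK;
	va = map_frames(pa, size);
	return (va + offset);
}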
*/ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { struct sysmaps *sysmaps; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_flush_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(sysmaps->CADDR2); sva = (vm_offset_t)sysmaps->CADDR2; eva = sva + PAGE_SIZE; /* * Use mfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt || cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. 
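/*
 * [Editor's sketch, not part of the diff] pmap_flush_page() above walks the
 * page one cache line at a time, issuing clflush (or clflushopt) for each
 * line; the kernel brackets the loop with mfence only when the flush
 * instruction is weakly ordered (clflushopt, or clflush on non-Intel
 * CPUs).  A userspace equivalent using the SSE2 intrinsics might look like
 * this; the 64-byte line size is an assumption, whereas the kernel reads
 * it from the CPUID-derived cpu_clflush_line_size, and the sketch fences
 * unconditionally for simplicity.
 */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define	SKETCH_CACHE_LINE	64u

static void
flush_range(const void *addr, size_t len)
{
	uintptr_t sva, eva;

	sva = (uintptr_t)addr & ~(uintptr_t)(SKETCH_CACHE_LINE - 1);
	eva = (uintptr_t)addr + len;
	_mm_mfence();			/* order against earlier stores */
	for (; sva < eva; sva += SKETCH_CACHE_LINE)
		_mm_clflush((const void *)sva);
	_mm_mfence();			/* wait for the flushes to complete */
}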
*/ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. 
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif Index: user/alc/PQ_LAUNDRY/sys/kern/vfs_bio.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/vfs_bio.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/kern/vfs_bio.c (revision 308054) @@ -1,4746 +1,4902 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include #include +#include +#include +#include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. */ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(struct vnode *vp, int); static int buf_recycle(bool); static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int); static void buf_release(void *, void **, int); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, 
&runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to 
bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer acquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int numbufallocfails; SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request/wakeup point for the bufspace daemon. */ static int bufspace_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). 
*/ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Maximum number of clean buffer queues. */ #define CLEAN_QUEUES 16 /* Configured number of clean queues. */ static int clean_queues; /* Maximum number of buffer queues. */ #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Lock for each bufqueue */ static struct mtx_padalign bqlocks[BUFFER_QUEUES]; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif static int bqcleanq(void) { static int nextq; return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); } static int bqisclean(int qindex) { return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); } /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { return (struct mtx *)&bqlocks[qindex]; } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. 
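/*
 * [Editor's sketch, not part of the diff] bqcleanq() above spreads buffers
 * over the clean queues by post-incrementing a shared counter with an
 * atomic fetch-and-add and taking it modulo the number of queues, so no
 * lock is needed to pick a queue.  A C11 restatement of that round-robin
 * pattern, with illustrative queue constants:
 */
#include <stdatomic.h>

#define	SKETCH_QUEUE_CLEAN	3	/* index of the first clean queue */
#define	SKETCH_CLEAN_QUEUES	16	/* illustrative queue count */

static int
pick_clean_queue(void)
{
	static _Atomic unsigned int nextq;

	return ((int)(atomic_fetch_add(&nextq, 1) % SKETCH_CLEAN_QUEUES) +
	    SKETCH_QUEUE_CLEAN);
}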
* getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static void bufspace_wakeup(void) { /* * If someone is waiting for bufspace, wake them up. * * Since needsbuffer is set prior to doing an additional queue * scan it is safe to check for the flag prior to acquiring the * lock. The thread that is preparing to scan again before * blocking would discover the buf we released. */ if (needsbuffer) { rw_rlock(&nblock); if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } } /* * bufspace_daemonwakeup: * * Wakeup the daemon responsible for freeing clean bufs. */ static void bufspace_daemonwakeup(void) { rw_rlock(&nblock); if (bufspace_request == 0) { bufspace_request = 1; wakeup(&bufspace_request); } rw_runlock(&nblock); } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); bufspace_wakeup(); } else { space = atomic_fetchadd_long(&bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + diff >= bufspacethresh) bufspace_daemonwakeup(); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(int size, bool metadata) { long limit; long space; if (metadata) limit = maxbufspace; else limit = hibufspace; do { space = bufspace; if (space + size > limit) return (ENOSPC); } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + size >= bufspacethresh) bufspace_daemonwakeup(); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(int size) { atomic_subtract_long(&bufspace, size); bufspace_wakeup(); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. needsbuffer must be set in a safe fashion prior to * polling for space. The operation must be re-tried on return. */ static void bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; rw_wlock(&nblock); while (needsbuffer != 0) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { rw_wunlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. 
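/*
 * [Editor's sketch, not part of the diff] bufspace_reserve() above claims
 * buffer space without a lock: it reads the current total, fails if the
 * request would exceed the limit, and otherwise publishes the new total
 * with a compare-and-swap, retrying on contention.  The same shape in C11
 * atomics, with illustrative names and an ENOSPC-style result:
 */
#include <errno.h>
#include <stdatomic.h>

static _Atomic long sketch_space;	/* currently reserved bytes */

static int
space_reserve(long size, long limit)
{
	long space;

	space = atomic_load(&sketch_space);
	do {
		if (space + size > limit)
			return (ENOSPC);
		/* On failure, "space" is refreshed with the current total. */
	} while (!atomic_compare_exchange_weak(&sketch_space, &space,
	    space + size));
	return (0);
}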
*/ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; rw_wlock(&nblock); if (fl != 0) continue; if (needsbuffer == 0) break; } error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } rw_wunlock(&nblock); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void) { for (;;) { kproc_suspend_check(bufspacedaemonproc); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bufspace > lobufspace || numfreebuffers < hifreebuffers) { if (buf_recycle(false) != 0) { atomic_set_int(&needsbuffer, 1); if (buf_recycle(false) != 0) { rw_wlock(&nblock); if (needsbuffer) rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, PRIBIO|PDROP, "bufspace", hz/10); else rw_wunlock(&nblock); } } maybe_yield(); } /* * Re-check our limits under the exclusive nblock. */ rw_wlock(&nblock); if (bufspace < bufspacethresh && numfreebuffers > lofreebuffers) { bufspace_request = 0; rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, "-", hz); } else rw_wunlock(&nblock); } } static struct kproc_desc bufspace_kp = { "bufspacedaemon", bufspace_daemon, &bufspacedaemonproc }; SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &bufspace_kp); /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. 
*/ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. 
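/*
 * [Editor's sketch, not part of the diff] When nbuf is not set by a
 * tunable, kern_vfs_bio_buffer_alloc() above sizes it from the estimated
 * physical memory: roughly 1/4 of the first 64 MB and 1/10 of everything
 * beyond, expressed through the "factor" terms.  The program below
 * evaluates that formula for a hypothetical 1 GB machine with BKVASIZE
 * assumed to be 16 KiB (a common default, but configuration dependent),
 * ignoring the maxbcache clamp.
 */
#include <stdio.h>

static long
lmin_sketch(long a, long b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	long physmem_est = 1024 * 1024;		/* 1 GB expressed in KB */
	long bkvasize = 16 * 1024;
	long factor = 4 * bkvasize / 1024;	/* the 1/4-of-RAM term */
	long nbuf = 50;

	if (physmem_est > 4096)
		nbuf += lmin_sketch((physmem_est - 4096) / factor,
		    65536 / factor);
	if (physmem_est > 65536)
		nbuf += lmin_sketch((physmem_est - 65536) * 2 / (factor * 5),
		    32 * 1024 * 1024 / (factor * 5));
	printf("nbuf = %ld\n", nbuf);	/* 7218 for this configuration */
	return (0);
}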
* * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; CTASSERT(MAXBCACHEBUF >= MAXBSIZE); mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. 
*/ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. 
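 * For example, a 1 GiB maxbufspace yields four clean queues, subject
 * to the compile-time CLEAN_QUEUES ceiling.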
*/ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Drop Giant and spin for a while to allow * interrupt threads to run. */ DROP_GIANT(); DELAY(50000 * iter); PICKUP_GIANT(); #else /* * Drop Giant and context switch several times to * allow interrupt threads to run. */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. 
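 * The pages are therefore entered at the page-aligned base of the
 * buffer's KVA window first, and the sub-page offset is re-applied
 * afterwards so that b_data points at the caller's data rather than
 * at the start of the first page.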
*/ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; if (qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } /* * Stick to the same clean queue for the lifetime of the buf to * limit locking below. Otherwise pick ont sequentially. */ if (qindex == QUEUE_CLEAN) { if (bqisclean(bp->b_qindex)) qindex = bp->b_qindex; else qindex = bqcleanq(); } /* * Handle delayed bremfree() processing. */ nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); bremfreel(bp); if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } } else mtx_lock(nlock); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. */ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); atomic_add_int(&numfreebuffers, 1); bufspace_wakeup(); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int flags) { struct buf *bp; int i; mtx_lock(&bqlocks[QUEUE_EMPTY]); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); if (bp == NULL) break; bremfreel(bp); store[i] = bp; } mtx_unlock(&bqlocks[QUEUE_EMPTY]); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { int i; for (i = 0; i < cnt; i++) binsfree(store[i], QUEUE_EMPTY); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(void) { struct buf *bp; bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { bufspace_daemonwakeup(); atomic_add_int(&numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition. */ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) bufspace_daemonwakeup(); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. 
xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_qrecycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_qrecycle(int qindex, bool kva) { struct buf *bp, *nbp; if (kva) atomic_add_int(&bufdefragcnt, 1); nbp = NULL; mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bremfreel(bp); mtx_unlock(&bqlocks[qindex]); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } mtx_unlock(&bqlocks[qindex]); return (ENOBUFS); } /* * buf_recycle: * * Iterate through all clean queues until we find a buf to recycle or * exhaust the search. */ static int buf_recycle(bool kva) { int qindex, first_qindex; qindex = first_qindex = bqcleanq(); do { if (buf_qrecycle(qindex, kva) == 0) return (0); if (++qindex == QUEUE_CLEAN + clean_queues) qindex = QUEUE_CLEAN; } while (qindex != first_qindex); return (ENOBUFS); } /* * buf_scan: * * Scan the clean queues looking for a buffer to recycle. needsbuffer * is set on failure so that the caller may optionally bufspace_wait() * in a race-free fashion. */ static int buf_scan(bool defrag) { int error; /* * To avoid heavy synchronization and wakeup races we set * needsbuffer and re-poll before failing. This ensures that * no frees can be missed between an unsuccessful poll and * going to sleep in a synchronized fashion. */ if ((error = buf_recycle(defrag)) != 0) { atomic_set_int(&needsbuffer, 1); bufspace_daemonwakeup(); error = buf_recycle(defrag); } if (error == 0) atomic_add_int(&getnewbufrestarts, 1); return (error); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. 
* */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~B_REMFREE; } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); atomic_subtract_long(&bufkvaspace, bp->b_kvasize); atomic_add_int(&buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; atomic_add_long(&bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { int i; for (i = 0; i < 5; i++) if (buf_scan(true) != 0) break; return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. 
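 * Blocks already resident (inmem()) are skipped outright; buffers
 * found cached by getblk() are simply released, and only the
 * remaining blocks have an asynchronous BIO_READ dispatched.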
*/ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. */ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. 
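 * The write is also charged against runningbufspace below; an async
 * writer that pushes the total past hirunningspace throttles itself
 * in waitrunningbufspace(), unless the thread carries
 * TDP_NORUNNINGBUF (e.g. the buf/syncer daemons) or the vnode backs
 * an md(4) device.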
*/ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. 
Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. 
Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). 
* * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. 
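 * Unlike brelse(), no attempt is made here to invalidate or strip
 * the buffer: delayed writes (and buffers whose background write
 * failed) are requeued on QUEUE_DIRTY, B_NOREUSE buffers are handed
 * to brelse(), and everything else goes back onto a clean queue.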
*/ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int bogus, i, iosize; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = 0; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Unwire a page held by a buf and place it on the appropriate vm queue. */ static void vfs_vmio_unwire(struct buf *bp, vm_page_t m) { bool freed; vm_page_lock(m); if (vm_page_unwire(m, PQ_NONE)) { /* * Determine if the page should be freed before adding * it to the inactive queue. */ if (m->valid == 0) { freed = !vm_page_busied(m); if (freed) vm_page_free(m); } else if ((bp->b_flags & B_DIRECT) != 0) freed = vm_page_try_to_free(m); else freed = false; if (!freed) { /* * If the page is unlikely to be reused, let the * VM know. 
Otherwise, maintain LRU page * ordering and put the page at the tail of the * inactive queue. */ if ((bp->b_flags & B_NOREUSE) != 0) vm_page_deactivate_noreuse(m); else vm_page_deactivate(m); } } vm_page_unlock(m); } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh", true); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vfs_vmio_unwire(bp, m); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); obj = bp->b_bufobj->bo_object; if (obj != NULL) VM_OBJECT_WLOCK(obj); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; vfs_vmio_unwire(bp, m); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); while (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_COUNT(desiredpages - bp->b_npages)); if (m->valid == 0) bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. 
We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? 
*/ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; atomic_add_int(&getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(maxsize, metadata) != 0) continue; reserved = true; if ((bp = buf_alloc()) == NULL) continue; if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while(buf_scan(false) == 0); if (reserved) atomic_subtract_long(&bufspace, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, int target) { int flushed; flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. 
The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; bool unlock; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlocks[queue]); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); while (flushed != target) { maybe_yield(); mtx_lock(&bqlocks[queue]); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqlocks[queue]); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { mtx_unlock(&bqlocks[queue]); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqlocks[queue]); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 
0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); notbufdflushes++; } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqlocks[queue]); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. 
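 * The per-page dirty test below is only performed for objects
 * flagged OBJ_MIGHTBEDIRTY, i.e. objects that may have been written
 * through a mapping; for anything else there is nothing to pick up.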
*/ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); bufspace_wait(bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. 
* * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBCACHEBUF) panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size, MAXBCACHEBUF); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. 
* * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) return NULL; bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); bufspace_release(maxsize); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! 
different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. 
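 *
 * A worked example with illustrative numbers (assuming DEV_BSIZE is 512 and
 * PAGE_SIZE is 4096): a request of size 6000 gives
 * newbsize = roundup2(6000, DEV_BSIZE) = 6144.  For a non-VMIO, non-malloced
 * buffer this is further rounded to round_page(6144) = 8192 bytes of
 * anonymous memory.  For a VMIO buffer whose b_offset starts 1024 bytes into
 * a page, desiredpages = num_pages(1024 + 6144) = 2, so the buffer ends up
 * backed by two pages of its VM object.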
*/ int allocbuf(struct buf *bp, int size) { int newbsize; BUF_ASSERT_HELD(bp); if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, OFF_TO_IDX(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else { bp->bio_flags |= BIO_DONE; done(bp); } } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. 
* * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. 
The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage", true); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { int i, bogus; vm_object_t obj; vm_ooffset_t foff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. 
* Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. 
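 *
 * (Descriptive note on the request below, based on the usual semantics of
 * the vm_page_alloc() flags: VM_ALLOC_SYSTEM lets the allocation dip into
 * the system reserve, VM_ALLOC_NOOBJ returns a page that is not inserted
 * into any VM object, VM_ALLOC_WIRED hands it back already wired, and
 * VM_ALLOC_COUNT() hints how many further pages this loop still intends
 * to allocate.  On failure the loop sleeps in VM_WAIT until the page
 * daemon has reclaimed memory, then retries.)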
*/ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); if (p == NULL) { VM_WAIT; goto tryagain; } pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; if (vm_page_sbusied(p)) printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); p->wire_count--; vm_page_free(p); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. 
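 *
 * For illustration, a hypothetical physio()-style caller pairs the two
 * functions around a transfer on a user buffer (the strategy call and all
 * error handling are elided):
 *
 *	if (vmapbuf(bp, 1) < 0)
 *		return (EFAULT);
 *	... issue the I/O and wait for it, e.g. with bufwait(bp) ...
 *	vunmapbuf(bp);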
*/ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } +} + +static int buf_pager_relbuf; +SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, + &buf_pager_relbuf, 0, + "Make buffer pager release buffers after reading"); + +/* + * The buffer pager. It uses buffer reads to validate pages. + * + * In contrast to the generic local pager from vm/vnode_pager.c, this + * pager correctly and easily handles volumes where the underlying + * device block size is greater than the machine page size. The + * buffer cache transparently extends the requested page run to be + * aligned at the block boundary, and does the necessary bogus page + * replacements in the addends to avoid obliterating already valid + * pages. + * + * The only non-trivial issue is that the exclusive busy state for + * pages, which is assumed by the vm_pager_getpages() interface, is + * incompatible with the VMIO buffer cache's desire to share-busy the + * pages. 
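+ *
+ * For illustration, a filesystem could hook this function into its
+ * VOP_GETPAGES with a pair of callbacks shaped roughly like the ones
+ * below; the myfs_* names and myfs_blocksize() are hypothetical and not
+ * part of this change:
+ *
+ *	static daddr_t
+ *	myfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
+ *	{
+ *		return (off / myfs_blocksize(vp));
+ *	}
+ *
+ *	static int
+ *	myfs_gbp_getblksize(struct vnode *vp, daddr_t lbn)
+ *	{
+ *		return (myfs_blocksize(vp));
+ *	}
+ *
+ *	and then, from the filesystem's getpages method:
+ *		return (vfs_bio_getpages(vp, ap->a_m, ap->a_count,
+ *		    ap->a_rbehind, ap->a_rahead, myfs_gbp_getblkno,
+ *		    myfs_gbp_getblksize));
+ *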
This function performs a trivial downgrade of the pages' + * state before reading buffers, and a less trivial upgrade from the + * shared-busy to excl-busy state after the read. + */ +int +vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, + int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, + vbg_get_blksize_t get_blksize) +{ + vm_page_t m; + vm_object_t object; + struct buf *bp; + daddr_t lbn, lbnp; + vm_ooffset_t la, lb, poff, poffe; + long bsize; + int bo_bs, error, i; + bool redo, lpart; + + object = vp->v_object; + la = IDX_TO_OFF(ma[count - 1]->pindex); + if (la >= object->un_pager.vnp.vnp_size) + return (VM_PAGER_BAD); + lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; + bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); + if (rbehind != NULL) { + lb = IDX_TO_OFF(ma[0]->pindex); + *rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); + } + if (rahead != NULL) { + *rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la); + if (la + IDX_TO_OFF(*rahead) >= object->un_pager.vnp.vnp_size) { + *rahead = OFF_TO_IDX(roundup2(object->un_pager. + vnp.vnp_size, PAGE_SIZE) - la); + } + } + VM_OBJECT_WLOCK(object); +again: + for (i = 0; i < count; i++) + vm_page_busy_downgrade(ma[i]); + VM_OBJECT_WUNLOCK(object); + + lbnp = -1; + for (i = 0; i < count; i++) { + m = ma[i]; + + /* + * Pages are shared busy and the object lock is not + * owned, which together allow for the pages' + * invalidation. The racy test for validity avoids + * useless creation of the buffer for the most typical + * case when invalidation is not used in redo or for + * parallel read. The shared->excl upgrade loop at + * the end of the function catches the race in a + * reliable way (protected by the object lock). + */ + if (m->valid == VM_PAGE_BITS_ALL) + continue; + + poff = IDX_TO_OFF(m->pindex); + poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); + for (; poff < poffe; poff += bsize) { + lbn = get_lblkno(vp, poff); + if (lbn == lbnp) + goto next_page; + lbnp = lbn; + + bsize = get_blksize(vp, lbn); + error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED, + &bp); + if (error != 0) + goto end_pages; + if (LIST_EMPTY(&bp->b_dep)) { + /* + * Invalidation clears m->valid, but + * may leave B_CACHE flag if the + * buffer existed at the invalidation + * time. In this case, recycle the + * buffer to do real read on next + * bread() after redo. + * + * Otherwise B_RELBUF is not strictly + * necessary, enable to reduce buf + * cache pressure. + */ + if (buf_pager_relbuf || + m->valid != VM_PAGE_BITS_ALL) + bp->b_flags |= B_RELBUF; + + bp->b_flags &= ~B_NOCACHE; + brelse(bp); + } else { + bqrelse(bp); + } + } + KASSERT(1 /* racy, enable for debugging */ || + m->valid == VM_PAGE_BITS_ALL || i == count - 1, + ("buf %d %p invalid", i, m)); + if (i == count - 1 && lpart) { + VM_OBJECT_WLOCK(object); + if (m->valid != 0 && + m->valid != VM_PAGE_BITS_ALL) + vm_page_zero_invalid(m, TRUE); + VM_OBJECT_WUNLOCK(object); + } +next_page:; + } +end_pages: + + VM_OBJECT_WLOCK(object); + redo = false; + for (i = 0; i < count; i++) { + vm_page_sunbusy(ma[i]); + ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); + + /* + * Since the pages were only sbusy while neither the + * buffer nor the object lock was held by us, or + * reallocated while vm_page_grab() slept for busy + * relinguish, they could have been invalidated. + * Recheck the valid bits and re-read as needed. 
+ * + * Note that the last page is made fully valid in the + * read loop, and partial validity for the page at + * index count - 1 could mean that the page was + * invalidated or removed, so we must restart for + * safety as well. + */ + if (ma[i]->valid != VM_PAGE_BITS_ALL) + redo = true; + } + if (redo && error == 0) + goto again; + VM_OBJECT_WUNLOCK(object); + return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: user/alc/PQ_LAUNDRY/sys/modules/hyperv/netvsc/Makefile =================================================================== --- user/alc/PQ_LAUNDRY/sys/modules/hyperv/netvsc/Makefile (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/modules/hyperv/netvsc/Makefile (revision 308054) @@ -1,14 +1,14 @@ # $FreeBSD$ .PATH: ${.CURDIR}/../../../dev/hyperv/netvsc \ ${.CURDIR}/../../../dev/hyperv/vmbus KMOD= hv_netvsc -SRCS= hv_net_vsc.c \ +SRCS= hn_nvs.c \ hv_netvsc_drv_freebsd.c \ hv_rndis_filter.c SRCS+= bus_if.h device_if.h opt_inet.h opt_inet6.h vmbus_if.h CFLAGS+= -I${.CURDIR}/../../../dev/hyperv/netvsc .include Index: user/alc/PQ_LAUNDRY/sys/net/netmap.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/net/netmap.h (revision 308053) +++ 
user/alc/PQ_LAUNDRY/sys/net/netmap.h (revision 308054) @@ -1,672 +1,648 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``S IS''AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * * Definitions of constants and the structures used by the netmap * framework, for the part visible to both kernel and userspace. * Detailed info on netmap is available with "man netmap" or at * * http://info.iet.unipi.it/~luigi/netmap/ * * This API is also used to communicate with the VALE software switch */ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ #define NETMAP_API 11 /* current API version */ #define NETMAP_MIN_API 11 /* min and max versions accepted */ #define NETMAP_MAX_API 15 /* * Some fields should be cache-aligned to reduce contention. * The alignment is architecture and OS dependent, but rather than * digging into OS headers to find the exact value we use an estimate * that should cover most architectures. */ #define NM_CACHE_ALIGN 128 /* * --- Netmap data structures --- * * The userspace data structures used by netmap are shown below. * They are allocated by the kernel and mmap()ed by userspace threads. * Pointers are implemented as memory offsets or indexes, * so that they can be easily dereferenced in kernel and userspace. KERNEL (opaque, obviously) ==================================================================== | USERSPACE | struct netmap_ring +---->+---------------+ / | head,cur,tail | struct netmap_if (nifp, 1 per fd) / | buf_ofs | +---------------+ / | other fields | | ni_tx_rings | / +===============+ | ni_rx_rings | / | buf_idx, len | slot[0] | | / | flags, ptr | | | / +---------------+ +===============+ / | buf_idx, len | slot[1] | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | | txring_ofs[1] | +---------------+ (tx+1 entries) (num_slots entries) | txring_ofs[t] | | buf_idx, len | slot[n-1] +---------------+ | flags, ptr | | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | (rx+1 entries) | rxring_ofs[r] | +---------------+ * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to * a file descriptor, the mmap()ed region contains a (logically readonly) * struct netmap_if pointing to struct netmap_ring's. 
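 *
 * For illustration, a minimal transmit-side user program (hypothetical,
 * error handling omitted; NETMAP_IF(), NETMAP_TXRING(), NETMAP_BUF() and
 * nm_ring_next() are the helpers from net/netmap_user.h) looks roughly
 * like:
 *
 *	struct nmreq req = { .nr_version = NETMAP_API,
 *	    .nr_flags = NR_REG_ALL_NIC };
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	fd = open("/dev/netmap", O_RDWR);
 *	ioctl(fd, NIOCREGIF, &req);
 *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	nifp = NETMAP_IF(mem, req.nr_offset);
 *	ring = NETMAP_TXRING(nifp, 0);
 *	slot = &ring->slot[ring->head];
 *	memcpy(NETMAP_BUF(ring, slot->buf_idx), frame, len);
 *	slot->len = len;
 *	ring->head = ring->cur = nm_ring_next(ring, ring->head);
 *	ioctl(fd, NIOCTXSYNC, NULL);
 *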
* * There is one netmap_ring per physical NIC ring, plus one tx/rx ring * pair attached to the host stack (this pair is unused for non-NIC ports). * * All physical/host stack ports share the same memory region, * so that zero-copy can be implemented between them. * VALE switch ports instead have separate memory regions. * * The netmap_ring is the userspace-visible replica of the NIC ring. * Each slot has the index of a buffer (MTU-sized and residing in the * mmapped region), its length and some flags. An extra 64-bit pointer * is provided for user-supplied buffers in the tx path. * * In user space, the buffer address is computed as * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE * * Added in NETMAP_API 11: * * + NIOCREGIF can request the allocation of extra spare buffers from * the same memory pool. The desired number of buffers must be in * nr_arg3. The ioctl may return fewer buffers, depending on memory * availability. nr_arg3 will return the actual value, and, once * mapped, nifp->ni_bufs_head will be the index of the first buffer. * * The buffers are linked to each other using the first uint32_t * as the index. On close, ni_bufs_head must point to the list of * buffers to be released. * * + NIOCREGIF can request space for extra rings (and buffers) * allocated in the same memory space. The number of extra rings * is in nr_arg1, and is advisory. This is a no-op on NICs where * the size of the memory space is fixed. * * + NIOCREGIF can attach to PIPE rings sharing the same memory * space with a parent device. The ifname indicates the parent device, * which must already exist. Flags in nr_flags indicate if we want to * bind the master or slave side, the index (from nr_ringid) * is just a cookie and does not need to be sequential. * * + NIOCREGIF can also attach to 'monitor' rings that replicate * the content of specific rings, also from the same memory space. * * Extra flags in nr_flags support the above functions. * Application libraries may use the following naming scheme: * netmap:foo all NIC ring pairs * netmap:foo^ only host ring pair * netmap:foo+ all NIC ring + host ring pairs * netmap:foo-k the k-th NIC ring pair * netmap:foo{k PIPE ring pair k, master side * netmap:foo}k PIPE ring pair k, slave side * * Some notes about host rings: * * + The RX host ring is used to store those packets that the host network * stack is trying to transmit through a NIC queue, but only if that queue * is currently in netmap mode. Netmap will not intercept host stack mbufs * designated to NIC queues that are not in netmap mode. As a consequence, * registering a netmap port with netmap:foo^ is not enough to intercept * mbufs in the RX host ring; the netmap port should be registered with * netmap:foo*, or another registration should be done to open at least a * NIC TX queue in netmap mode. * * + Netmap is not currently able to deal with intercepted trasmit mbufs which * require offloadings like TSO, UFO, checksumming offloadings, etc. It is * responsibility of the user to disable those offloadings (e.g. using * ifconfig on FreeBSD or ethtool -K on Linux) for an interface that is being * used in netmap mode. If the offloadings are not disabled, GSO and/or * unchecksummed packets may be dropped immediately or end up in the host RX * ring, and will be dropped as soon as the packet reaches another netmap * adapter. */ /* * struct netmap_slot is a buffer descriptor */ struct netmap_slot { uint32_t buf_idx; /* buffer index */ uint16_t len; /* length for this slot */ uint16_t flags; /* buf changed, etc. 
*/ uint64_t ptr; /* pointer for indirect buffers */ }; /* * The following flags control how the slot is used */ #define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ /* * must be set whenever buf_idx is changed (as it might be * necessary to recompute the physical address and mapping) * * It is also set by the kernel whenever the buf_idx is * changed internally (e.g., by pipes). Applications may * use this information to know when they can reuse the * contents of previously prepared buffers. */ #define NS_REPORT 0x0002 /* ask the hardware to report results */ /* * Request notification when slot is used by the hardware. * Normally transmit completions are handled lazily and * may be unreported. This flag lets us know when a slot * has been sent (e.g. to terminate the sender). */ #define NS_FORWARD 0x0004 /* pass packet 'forward' */ /* * (Only for physical ports, rx rings with NR_FORWARD set). * Slot released to the kernel (i.e. before ring->head) with * this flag set are passed to the peer ring (host/NIC), * thus restoring the host-NIC connection for these slots. * This supports efficient traffic monitoring or firewalling. */ #define NS_NO_LEARN 0x0008 /* disable bridge learning */ /* * On a VALE switch, do not 'learn' the source port for * this buffer. */ #define NS_INDIRECT 0x0010 /* userspace buffer */ /* * (VALE tx rings only) data is in a userspace buffer, * whose address is in the 'ptr' field in the slot. */ #define NS_MOREFRAG 0x0020 /* packet has more fragments */ /* * (VALE ports only) * Set on all but the last slot of a multi-segment packet. * The 'len' field refers to the individual fragment. */ #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) /* * The high 8 bits of the flag, if not zero, indicate the * destination port for the VALE switch, overriding * the lookup table. */ #define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) /* * (VALE rx rings only) the high 8 bits * are the number of fragments. */ /* * struct netmap_ring * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. * At the software level the important fields are: head, cur, tail. * * In TX rings: * * head first slot available for transmission. * cur wakeup point. select() and poll() will unblock * when 'tail' moves past 'cur' * tail (readonly) first slot reserved to the kernel * * [head .. tail-1] can be used for new packets to send; * 'head' and 'cur' must be incremented as slots are filled * with new packets to be sent; * 'cur' can be moved further ahead if we need more space * for new transmissions. XXX todo (2014-03-12) * * In RX rings: * * head first valid received packet * cur wakeup point. select() and poll() will unblock * when 'tail' moves past 'cur' * tail (readonly) first slot reserved to the kernel * * [head .. tail-1] contain received packets; * 'head' and 'cur' must be incremented as slots are consumed * and can be returned to the kernel; * 'cur' can be moved further ahead if we want to wait for * new packets without returning the previous ones. * * DATA OWNERSHIP/LOCKING: * The netmap_ring, and all slots and buffers in the range * [head .. tail-1] are owned by the user program; * the kernel only accesses them during a netmap system call * and in the user thread context. * * Other slots and buffers are reserved for use by the kernel */ struct netmap_ring { /* * buf_ofs is meant to be used through macros. * It contains the offset of the buffer region from this * descriptor. 
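 *
 * For illustration (this is what the NETMAP_BUF() helper in
 * net/netmap_user.h builds on top of this field), the address of the
 * buffer attached to slot i can be computed as:
 *
 *	(char *)ring + ring->buf_ofs + ring->slot[i].buf_idx * ring->nr_buf_size
 *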
*/ const int64_t buf_ofs; const uint32_t num_slots; /* number of slots in the ring. */ const uint32_t nr_buf_size; const uint16_t ringid; const uint16_t dir; /* 0: tx, 1: rx */ uint32_t head; /* (u) first user slot */ uint32_t cur; /* (u) wakeup point */ uint32_t tail; /* (k) first kernel slot */ uint32_t flags; struct timeval ts; /* (k) time of last *sync() */ /* opaque room for a mutex or similar object */ #if !defined(_WIN32) || defined(__CYGWIN__) uint8_t __attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128]; #else uint8_t __declspec(align(NM_CACHE_ALIGN)) sem[128]; #endif /* the slots follow. This struct has variable size */ struct netmap_slot slot[0]; /* array of slots. */ }; /* * RING FLAGS */ #define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ /* * updates the 'ts' field on each netmap syscall. This saves * saves a separate gettimeofday(), and is not much worse than * software timestamps generated in the interrupt handler. */ #define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ /* * Enables the NS_FORWARD slot flag for the ring. */ /* * Netmap representation of an interface and its queue(s). * This is initialized by the kernel when binding a file * descriptor to a port, and should be considered as readonly * by user programs. The kernel never uses it. * * There is one netmap_if for each file descriptor on which we want * to select/poll. * select/poll operates on one or all pairs depending on the value of * nmr_queueid passed on the ioctl. */ struct netmap_if { char ni_name[IFNAMSIZ]; /* name of the interface. */ const uint32_t ni_version; /* API version, currently unused */ const uint32_t ni_flags; /* properties */ #define NI_PRIV_MEM 0x1 /* private memory region */ /* * The number of packet rings available in netmap mode. * Physical NICs can have different numbers of tx and rx rings. * Physical NICs also have a 'host' ring pair. * Additionally, clients can request additional ring pairs to * be used for internal communication. */ const uint32_t ni_tx_rings; /* number of HW tx rings */ const uint32_t ni_rx_rings; /* number of HW rx rings */ uint32_t ni_bufs_head; /* head index for extra bufs */ uint32_t ni_spare1[5]; /* * The following array contains the offset of each netmap ring * from this structure, in the following order: * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings. * * The area is filled up by the kernel on NIOCREGIF, * and then only read by userspace code. */ const ssize_t ring_ofs[0]; }; #ifndef NIOCREGIF /* * ioctl names and related fields * * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, * whose identity is set in NIOCREGIF through nr_ringid. * These are non blocking and take no argument. * * NIOCGINFO takes a struct ifreq, the interface name is the input, * the outputs are number of queues and number of descriptor * for each queue (useful to set number of threads etc.). * The info returned is only advisory and may change before * the interface is bound to a file descriptor. * * NIOCREGIF takes an interface name within a struct nmre, * and activates netmap mode on the interface (if possible). * * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we * can pass it down to other NIC-related ioctls. * * The actual argument (struct nmreq) has a number of options to request * different functions. * The following are used in NIOCREGIF when nr_cmd == 0: * * nr_name (in) * The name of the port (em0, valeXXX:YYY, etc.) * limited to IFNAMSIZ for backward compatibility. 
* * nr_version (in/out) * Must match NETMAP_API as used in the kernel, error otherwise. * Always returns the desired value on output. * * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out) * On input, non-zero values may be used to reconfigure the port * according to the requested values, but this is not guaranteed. * On output the actual values in use are reported. * * nr_ringid (in) * Indicates how rings should be bound to the file descriptors. * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) * are used to indicate the ring number, and nr_flags specifies * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected. * * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control * the binding as follows: * 0 (default) binds all physical rings * NETMAP_HW_RING | ring number binds a single ring pair * NETMAP_SW_RING binds only the host tx/rx rings * * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push * packets on tx rings only if POLLOUT is set. * The default is to push any pending packet. * * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release * packets on rx rings also when POLLIN is NOT set. * The default is to touch the rx ring only with POLLIN. * Note that this is the opposite of TX because it * reflects the common usage. * * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. * NETMAP_PRIV_MEM is set on return for ports that do not use * the global memory allocator. * This information is not significant and applications * should look at the region id in nr_arg2 * * nr_flags is the recommended mode to indicate which rings should * be bound to a file descriptor. Values are NR_REG_* * * nr_arg1 (in) The number of extra rings to be reserved. * Especially when allocating a VALE port the system only * allocates the amount of memory needed for the port. * If more shared memory rings are desired (e.g. for pipes), * the first invocation for the same basename/allocator * should specify a suitable number. Memory cannot be * extended after the first allocation without closing * all ports on the same region. * * nr_arg2 (in/out) The identity of the memory region used. * On input, 0 means the system decides autonomously, * other values may try to select a specific region. * On return the actual value is reported. * Region '1' is the global allocator, normally shared * by all interfaces. Other values are private regions. * If two ports the same region zero-copy is possible. * * nr_arg3 (in/out) number of extra buffers to be allocated. * * * * nr_cmd (in) if non-zero indicates a special command: * NETMAP_BDG_ATTACH and nr_name = vale*:ifname * attaches the NIC to the switch; nr_ringid specifies * which rings to use. Used by vale-ctl -a ... * nr_arg1 = NETMAP_BDG_HOST also attaches the host port * as in vale-ctl -h ... * * NETMAP_BDG_DETACH and nr_name = vale*:ifname * disconnects a previously attached NIC. * Used by vale-ctl -d ... * * NETMAP_BDG_LIST * list the configuration of VALE switches. * * NETMAP_BDG_VNET_HDR * Set the virtio-net header length used by the client * of a VALE switch port. * * NETMAP_BDG_NEWIF * create a persistent VALE port with name nr_name. * Used by vale-ctl -n ... * * NETMAP_BDG_DELIF * delete a persistent VALE port. Used by vale-ctl -d ... 
* * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific * * * */ /* * struct nmreq overlays a struct ifreq (just the name) */ struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ uint16_t nr_ringid; /* ring(s) we care about */ #define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ #define NETMAP_SW_RING 0x2000 /* only host ring pair */ #define NETMAP_RING_MASK 0x0fff /* the ring number */ #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ #define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ uint16_t nr_cmd; #define NETMAP_BDG_ATTACH 1 /* attach the NIC */ #define NETMAP_BDG_DETACH 2 /* detach the NIC */ #define NETMAP_BDG_REGOPS 3 /* register bridge callbacks */ #define NETMAP_BDG_LIST 4 /* get bridge's info */ #define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ #define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ #define NETMAP_BDG_NEWIF 6 /* create a virtual port */ #define NETMAP_BDG_DELIF 7 /* destroy a virtual port */ #define NETMAP_PT_HOST_CREATE 8 /* create ptnetmap kthreads */ #define NETMAP_PT_HOST_DELETE 9 /* delete ptnetmap kthreads */ #define NETMAP_BDG_POLLING_ON 10 /* delete polling kthread */ #define NETMAP_BDG_POLLING_OFF 11 /* delete polling kthread */ #define NETMAP_VNET_HDR_GET 12 /* get the port virtio-net-hdr length */ +#define NETMAP_POOLS_INFO_GET 13 /* get memory allocator pools info */ uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ uint16_t nr_arg2; uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ uint32_t nr_flags; /* various modes, extends nr_ringid */ uint32_t spare2[1]; }; #define NR_REG_MASK 0xf /* values for nr_flags */ enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ NR_REG_ALL_NIC = 1, NR_REG_SW = 2, NR_REG_NIC_SW = 3, NR_REG_ONE_NIC = 4, NR_REG_PIPE_MASTER = 5, NR_REG_PIPE_SLAVE = 6, }; /* monitor uses the NR_REG to select the rings to monitor */ #define NR_MONITOR_TX 0x100 #define NR_MONITOR_RX 0x200 #define NR_ZCOPY_MON 0x400 /* request exclusive access to the selected rings */ #define NR_EXCLUSIVE 0x800 /* request ptnetmap host support */ #define NR_PASSTHROUGH_HOST NR_PTNETMAP_HOST /* deprecated */ #define NR_PTNETMAP_HOST 0x1000 #define NR_RX_RINGS_ONLY 0x2000 #define NR_TX_RINGS_ONLY 0x4000 /* Applications set this flag if they are able to deal with virtio-net headers, * that is send/receive frames that start with a virtio-net header. * If not set, NIOCREGIF will fail with netmap ports that require applications * to use those headers. If the flag is set, the application can use the * NETMAP_VNET_HDR_GET command to figure out the header length. */ #define NR_ACCEPT_VNET_HDR 0x8000 #define NM_BDG_NAME "vale" /* prefix for bridge port name */ /* * Windows does not have _IOWR(). _IO(), _IOW() and _IOR() are defined * in ws2def.h but not sure if they are in the form we need. 
* XXX so we redefine them * in a convenient way to use for DeviceIoControl signatures */ #ifdef _WIN32 #undef _IO // ws2def.h #define _WIN_NM_IOCTL_TYPE 40000 #define _IO(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ METHOD_BUFFERED, FILE_ANY_ACCESS ) #define _IO_direct(_c, _n) CTL_CODE(_WIN_NM_IOCTL_TYPE, ((_n) + 0x800) , \ METHOD_OUT_DIRECT, FILE_ANY_ACCESS ) #define _IOWR(_c, _n, _s) _IO(_c, _n) /* We havesome internal sysctl in addition to the externally visible ones */ #define NETMAP_MMAP _IO_direct('i', 160) // note METHOD_OUT_DIRECT #define NETMAP_POLL _IO('i', 162) /* and also two setsockopt for sysctl emulation */ #define NETMAP_SETSOCKOPT _IO('i', 140) #define NETMAP_GETSOCKOPT _IO('i', 141) //These linknames are for the Netmap Core Driver #define NETMAP_NT_DEVICE_NAME L"\\Device\\NETMAP" #define NETMAP_DOS_DEVICE_NAME L"\\DosDevices\\netmap" //Definition of a structure used to pass a virtual address within an IOCTL typedef struct _MEMORY_ENTRY { PVOID pUsermodeVirtualAddress; } MEMORY_ENTRY, *PMEMORY_ENTRY; typedef struct _POLL_REQUEST_DATA { int events; int timeout; int revents; } POLL_REQUEST_DATA; #endif /* _WIN32 */ /* * FreeBSD uses the size value embedded in the _IOWR to determine * how much to copy in/out. So we need it to match the actual * data structure we pass. We put some spares in the structure * to ease compatibility with other versions */ #define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ #define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ #define NIOCCONFIG _IOWR('i',150, struct nm_ifreq) /* for ext. modules */ #endif /* !NIOCREGIF */ /* * Helper functions for kernel and userspace */ /* * check if space is available in the ring. */ static inline int nm_ring_empty(struct netmap_ring *ring) { return (ring->cur == ring->tail); } /* * Opaque structure that is passed to an external kernel * module via ioctl(fd, NIOCCONFIG, req) for a user-owned * bridge port (at this point ephemeral VALE interface). */ #define NM_IFRDATA_LEN 256 struct nm_ifreq { char nifr_name[IFNAMSIZ]; char data[NM_IFRDATA_LEN]; }; -/* - * netmap kernel thread configuration - */ -/* bhyve/vmm.ko MSIX parameters for IOCTL */ -struct ptn_vmm_ioctl_msix { - uint64_t msg; - uint64_t addr; -}; - -/* IOCTL parameters */ -struct nm_kth_ioctl { - uint64_t com; - /* We use union to support more ioctl commands. */ - union { - struct ptn_vmm_ioctl_msix msix; - } data; -}; - -/* Configuration of a ptnetmap ring */ -struct ptnet_ring_cfg { - uint64_t ioeventfd; /* eventfd in linux, tsleep() parameter in FreeBSD */ - uint64_t irqfd; /* eventfd in linux, ioctl fd in FreeBSD */ - struct nm_kth_ioctl ioctl; /* ioctl parameter to send irq (only used in bhyve/FreeBSD) */ - uint64_t reserved[4]; /* reserved to support of more hypervisors */ -}; #endif /* _NET_NETMAP_H_ */ Index: user/alc/PQ_LAUNDRY/sys/net/netmap_virt.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/net/netmap_virt.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/net/netmap_virt.h (revision 308054) @@ -1,280 +1,305 @@ /* * Copyright (C) 2013-2016 Luigi Rizzo * Copyright (C) 2013-2016 Giuseppe Lettieri * Copyright (C) 2013-2016 Vincenzo Maffione * Copyright (C) 2015 Stefano Garzarella * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef NETMAP_VIRT_H #define NETMAP_VIRT_H -#define NETMAP_VIRT_CSB_SIZE 4096 - -/* ptnetmap features */ -#define PTNETMAP_F_BASE 1 -#define PTNETMAP_F_FULL 2 /* not used */ -#define PTNETMAP_F_VNET_HDR 4 - /* * ptnetmap_memdev: device used to expose memory into the guest VM * * These macros are used in the hypervisor frontend (QEMU, bhyve) and in the * guest device driver. */ /* PCI identifiers and PCI BARs for the ptnetmap memdev * and ptnetmap network interface. */ #define PTNETMAP_MEMDEV_NAME "ptnetmap-memdev" -#define PTNETMAP_PCI_VENDOR_ID 0x3333 /* TODO change vendor_id */ -#define PTNETMAP_PCI_DEVICE_ID 0x0001 /* memory device */ -#define PTNETMAP_PCI_NETIF_ID 0x0002 /* ptnet network interface */ +#define PTNETMAP_PCI_VENDOR_ID 0x1b36 /* QEMU virtual devices */ +#define PTNETMAP_PCI_DEVICE_ID 0x000c /* memory device */ +#define PTNETMAP_PCI_NETIF_ID 0x000d /* ptnet network interface */ #define PTNETMAP_IO_PCI_BAR 0 #define PTNETMAP_MEM_PCI_BAR 1 #define PTNETMAP_MSIX_PCI_BAR 2 /* Registers for the ptnetmap memdev */ -/* 32 bit r/o */ -#define PTNETMAP_IO_PCI_MEMSIZE 0 /* size of the netmap memory shared - * between guest and host */ -/* 16 bit r/o */ -#define PTNETMAP_IO_PCI_HOSTID 4 /* memory allocator ID in netmap host */ -#define PTNETMAP_IO_SIZE 6 +#define PTNET_MDEV_IO_MEMSIZE_LO 0 /* netmap memory size (low) */ +#define PTNET_MDEV_IO_MEMSIZE_HI 4 /* netmap_memory_size (high) */ +#define PTNET_MDEV_IO_MEMID 8 /* memory allocator ID in the host */ +#define PTNET_MDEV_IO_IF_POOL_OFS 64 +#define PTNET_MDEV_IO_IF_POOL_OBJNUM 68 +#define PTNET_MDEV_IO_IF_POOL_OBJSZ 72 +#define PTNET_MDEV_IO_RING_POOL_OFS 76 +#define PTNET_MDEV_IO_RING_POOL_OBJNUM 80 +#define PTNET_MDEV_IO_RING_POOL_OBJSZ 84 +#define PTNET_MDEV_IO_BUF_POOL_OFS 88 +#define PTNET_MDEV_IO_BUF_POOL_OBJNUM 92 +#define PTNET_MDEV_IO_BUF_POOL_OBJSZ 96 +#define PTNET_MDEV_IO_END 100 /* * ptnetmap configuration * - * The hypervisor (QEMU or bhyve) sends this struct to the host netmap - * module through an ioctl() command when it wants to start the ptnetmap - * kthreads. + * The ptnet kthreads (running in host kernel-space) need to be configured + * in order to know how to intercept guest kicks (I/O register writes) and + * how to inject MSI-X interrupts to the guest. 
The configuration may vary + * depending on the hypervisor. Currently, we support QEMU/KVM on Linux and + * and bhyve on FreeBSD. + * The configuration is passed by the hypervisor to the host netmap module + * by means of an ioctl() with nr_cmd=NETMAP_PT_HOST_CREATE, and it is + * specified by the ptnetmap_cfg struct. This struct contains an header + * with general informations and an array of entries whose size depends + * on the hypervisor. The NETMAP_PT_HOST_CREATE command is issued every + * time the kthreads are started. */ struct ptnetmap_cfg { -#define PTNETMAP_CFG_FEAT_CSB 0x0001 -#define PTNETMAP_CFG_FEAT_EVENTFD 0x0002 -#define PTNETMAP_CFG_FEAT_IOCTL 0x0004 - uint32_t features; - void *ptrings; /* ptrings inside CSB */ - uint32_t num_rings; /* number of entries */ - struct ptnet_ring_cfg entries[0]; /* per-ptring configuration */ +#define PTNETMAP_CFGTYPE_QEMU 0x1 +#define PTNETMAP_CFGTYPE_BHYVE 0x2 + uint16_t cfgtype; /* how to interpret the cfg entries */ + uint16_t entry_size; /* size of a config entry */ + uint32_t num_rings; /* number of config entries */ + void *ptrings; /* ptrings inside CSB */ + /* Configuration entries are allocated right after the struct. */ }; +/* Configuration of a ptnetmap ring for QEMU. */ +struct ptnetmap_cfgentry_qemu { + uint32_t ioeventfd; /* to intercept guest register access */ + uint32_t irqfd; /* to inject guest interrupts */ +}; + +/* Configuration of a ptnetmap ring for bhyve. */ +struct ptnetmap_cfgentry_bhyve { + uint64_t wchan; /* tsleep() parameter, to wake up kthread */ + uint32_t ioctl_fd; /* ioctl fd */ + /* ioctl parameters to send irq */ + uint32_t ioctl_cmd; + /* vmm.ko MSIX parameters for IOCTL */ + struct { + uint64_t msg_data; + uint64_t addr; + } ioctl_data; +}; + /* - * Functions used to write ptnetmap_cfg from/to the nmreq. - * The user-space application writes the pointer of ptnetmap_cfg - * (user-space buffer) starting from nr_arg1 field, so that the kernel - * can read it with copyin (copy_from_user). + * Structure filled-in by the kernel when asked for allocator info + * through NETMAP_POOLS_INFO_GET. Used by hypervisors supporting + * ptnetmap. */ +struct netmap_pools_info { + uint64_t memsize; /* same as nmr->nr_memsize */ + uint32_t memid; /* same as nmr->nr_arg2 */ + uint32_t if_pool_offset; + uint32_t if_pool_objtotal; + uint32_t if_pool_objsize; + uint32_t ring_pool_offset; + uint32_t ring_pool_objtotal; + uint32_t ring_pool_objsize; + uint32_t buf_pool_offset; + uint32_t buf_pool_objtotal; + uint32_t buf_pool_objsize; +}; + +/* + * Pass a pointer to a userspace buffer to be passed to kernelspace for write + * or read. Used by NETMAP_PT_HOST_CREATE and NETMAP_POOLS_INFO_GET. + */ static inline void -ptnetmap_write_cfg(struct nmreq *nmr, struct ptnetmap_cfg *cfg) +nmreq_pointer_put(struct nmreq *nmr, void *userptr) { - uintptr_t *nmr_ptncfg = (uintptr_t *)&nmr->nr_arg1; - *nmr_ptncfg = (uintptr_t)cfg; + uintptr_t *pp = (uintptr_t *)&nmr->nr_arg1; + *pp = (uintptr_t)userptr; } -/* ptnetmap control commands */ -#define PTNETMAP_PTCTL_CONFIG 1 -#define PTNETMAP_PTCTL_FINALIZE 2 -#define PTNETMAP_PTCTL_IFNEW 3 -#define PTNETMAP_PTCTL_IFDELETE 4 -#define PTNETMAP_PTCTL_RINGSCREATE 5 -#define PTNETMAP_PTCTL_RINGSDELETE 6 -#define PTNETMAP_PTCTL_DEREF 7 -#define PTNETMAP_PTCTL_TXSYNC 8 -#define PTNETMAP_PTCTL_RXSYNC 9 -#define PTNETMAP_PTCTL_REGIF 10 -#define PTNETMAP_PTCTL_UNREGIF 11 -#define PTNETMAP_PTCTL_HOSTMEMID 12 +/* ptnetmap features */ +#define PTNETMAP_F_VNET_HDR 1 - /* I/O registers for the ptnet device. 
*/ #define PTNET_IO_PTFEAT 0 #define PTNET_IO_PTCTL 4 -#define PTNET_IO_PTSTS 8 -#define PTNET_IO_MAC_LO 12 -#define PTNET_IO_MAC_HI 16 -#define PTNET_IO_CSBBAH 20 -#define PTNET_IO_CSBBAL 24 -#define PTNET_IO_NIFP_OFS 28 -#define PTNET_IO_NUM_TX_RINGS 32 -#define PTNET_IO_NUM_RX_RINGS 36 -#define PTNET_IO_NUM_TX_SLOTS 40 -#define PTNET_IO_NUM_RX_SLOTS 44 -#define PTNET_IO_VNET_HDR_LEN 48 +#define PTNET_IO_MAC_LO 8 +#define PTNET_IO_MAC_HI 12 +#define PTNET_IO_CSBBAH 16 +#define PTNET_IO_CSBBAL 20 +#define PTNET_IO_NIFP_OFS 24 +#define PTNET_IO_NUM_TX_RINGS 28 +#define PTNET_IO_NUM_RX_RINGS 32 +#define PTNET_IO_NUM_TX_SLOTS 36 +#define PTNET_IO_NUM_RX_SLOTS 40 +#define PTNET_IO_VNET_HDR_LEN 44 +#define PTNET_IO_HOSTMEMID 48 #define PTNET_IO_END 52 #define PTNET_IO_KICK_BASE 128 -#define PTNET_IO_MASK 0xff +#define PTNET_IO_MASK 0xff +/* ptnetmap control commands (values for PTCTL register) */ +#define PTNETMAP_PTCTL_CREATE 1 +#define PTNETMAP_PTCTL_DELETE 2 + /* If defined, CSB is allocated by the guest, not by the host. */ #define PTNET_CSB_ALLOC /* ptnetmap ring fields shared between guest and host */ struct ptnet_ring { /* XXX revise the layout to minimize cache bounces. */ uint32_t head; /* GW+ HR+ the head of the guest netmap_ring */ uint32_t cur; /* GW+ HR+ the cur of the guest netmap_ring */ uint32_t guest_need_kick; /* GW+ HR+ host-->guest notification enable */ uint32_t sync_flags; /* GW+ HR+ the flags of the guest [tx|rx]sync() */ uint32_t hwcur; /* GR+ HW+ the hwcur of the host netmap_kring */ uint32_t hwtail; /* GR+ HW+ the hwtail of the host netmap_kring */ uint32_t host_need_kick; /* GR+ HW+ guest-->host notification enable */ char pad[4]; }; /* CSB for the ptnet device. */ struct ptnet_csb { +#define NETMAP_VIRT_CSB_SIZE 4096 struct ptnet_ring rings[NETMAP_VIRT_CSB_SIZE/sizeof(struct ptnet_ring)]; }; -#if defined (WITH_PTNETMAP_HOST) || defined (WITH_PTNETMAP_GUEST) - -/* return l_elem - r_elem with wraparound */ -static inline uint32_t -ptn_sub(uint32_t l_elem, uint32_t r_elem, uint32_t num_slots) -{ - int64_t res; - - res = (int64_t)(l_elem) - r_elem; - - return (res < 0) ? res + num_slots : res; -} -#endif /* WITH_PTNETMAP_HOST || WITH_PTNETMAP_GUEST */ - #ifdef WITH_PTNETMAP_GUEST /* ptnetmap_memdev routines used to talk with ptnetmap_memdev device driver */ struct ptnetmap_memdev; -int nm_os_pt_memdev_iomap(struct ptnetmap_memdev *, vm_paddr_t *, void **); +int nm_os_pt_memdev_iomap(struct ptnetmap_memdev *, vm_paddr_t *, void **, + uint64_t *); void nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *); +uint32_t nm_os_pt_memdev_ioread(struct ptnetmap_memdev *, unsigned int); /* Guest driver: Write kring pointers (cur, head) to the CSB. * This routine is coupled with ptnetmap_host_read_kring_csb(). */ static inline void ptnetmap_guest_write_kring_csb(struct ptnet_ring *ptr, uint32_t cur, uint32_t head) { /* * We need to write cur and head to the CSB but we cannot do it atomically. * There is no way we can prevent the host from reading the updated value * of one of the two and the old value of the other. However, if we make * sure that the host never reads a value of head more recent than the * value of cur we are safe. We can allow the host to read a value of cur * more recent than the value of head, since in the netmap ring cur can be * ahead of head and cur cannot wrap around head because it must be behind * tail. Inverting the order of writes below could instead result into the * host to think head went ahead of cur, which would cause the sync * prologue to fail. 
* * The following memory barrier scheme is used to make this happen: * * Guest Host * * STORE(cur) LOAD(head) * mb() <-----------> mb() * STORE(head) LOAD(cur) */ ptr->cur = cur; mb(); ptr->head = head; } /* Guest driver: Read kring pointers (hwcur, hwtail) from the CSB. * This routine is coupled with ptnetmap_host_write_kring_csb(). */ static inline void ptnetmap_guest_read_kring_csb(struct ptnet_ring *ptr, struct netmap_kring *kring) { /* * We place a memory barrier to make sure that the update of hwtail never * overtakes the update of hwcur. * (see explanation in ptnetmap_host_write_kring_csb). */ kring->nr_hwtail = ptr->hwtail; mb(); kring->nr_hwcur = ptr->hwcur; } #endif /* WITH_PTNETMAP_GUEST */ #ifdef WITH_PTNETMAP_HOST /* * ptnetmap kernel thread routines * */ /* Functions to read and write CSB fields in the host */ #if defined (linux) #define CSB_READ(csb, field, r) (get_user(r, &csb->field)) #define CSB_WRITE(csb, field, v) (put_user(v, &csb->field)) #else /* ! linux */ #define CSB_READ(csb, field, r) (r = fuword32(&csb->field)) #define CSB_WRITE(csb, field, v) (suword32(&csb->field, v)) #endif /* ! linux */ /* Host netmap: Write kring pointers (hwcur, hwtail) to the CSB. * This routine is coupled with ptnetmap_guest_read_kring_csb(). */ static inline void ptnetmap_host_write_kring_csb(struct ptnet_ring __user *ptr, uint32_t hwcur, uint32_t hwtail) { /* * The same scheme used in ptnetmap_guest_write_kring_csb() applies here. * We allow the guest to read a value of hwcur more recent than the value * of hwtail, since this would anyway result in a consistent view of the * ring state (and hwcur can never wraparound hwtail, since hwcur must be * behind head). * * The following memory barrier scheme is used to make this happen: * * Guest Host * * STORE(hwcur) LOAD(hwtail) * mb() <-------------> mb() * STORE(hwtail) LOAD(hwcur) */ CSB_WRITE(ptr, hwcur, hwcur); mb(); CSB_WRITE(ptr, hwtail, hwtail); } /* Host netmap: Read kring pointers (head, cur, sync_flags) from the CSB. * This routine is coupled with ptnetmap_guest_write_kring_csb(). */ static inline void ptnetmap_host_read_kring_csb(struct ptnet_ring __user *ptr, struct netmap_ring *shadow_ring, uint32_t num_slots) { /* * We place a memory barrier to make sure that the update of head never * overtakes the update of cur. * (see explanation in ptnetmap_guest_write_kring_csb). */ CSB_READ(ptr, head, shadow_ring->head); mb(); CSB_READ(ptr, cur, shadow_ring->cur); CSB_READ(ptr, sync_flags, shadow_ring->flags); } #endif /* WITH_PTNETMAP_HOST */ #endif /* NETMAP_VIRT_H */ Index: user/alc/PQ_LAUNDRY/sys/net/rndis.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/net/rndis.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/net/rndis.h (revision 308054) @@ -1,381 +1,382 @@ /* $FreeBSD$ */ /* $OpenBSD: if_urndisreg.h,v 1.19 2013/11/21 14:08:05 mpi Exp $ */ /* * Copyright (c) 2010 Jonathan Armani * Copyright (c) 2010 Fabien Romano * Copyright (c) 2010 Michael Knudsen * All rights reserved. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #ifndef _NET_RNDIS_H_ #define _NET_RNDIS_H_ /* Canonical major/minor version as of 22th Aug. 2016. */ #define RNDIS_VERSION_MAJOR 0x00000001 #define RNDIS_VERSION_MINOR 0x00000000 #define RNDIS_STATUS_SUCCESS 0x00000000L #define RNDIS_STATUS_PENDING 0x00000103L #define RNDIS_STATUS_MEDIA_CONNECT 0x4001000BL #define RNDIS_STATUS_MEDIA_DISCONNECT 0x4001000CL #define RNDIS_STATUS_NETWORK_CHANGE 0x40010018L #define RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG 0x40020006L #define RNDIS_STATUS_BUFFER_OVERFLOW 0x80000005L #define RNDIS_STATUS_FAILURE 0xC0000001L #define RNDIS_STATUS_NOT_SUPPORTED 0xC00000BBL #define RNDIS_STATUS_RESOURCES 0xC000009AL #define RNDIS_STATUS_INVALID_DATA 0xC0010015L #define OID_GEN_SUPPORTED_LIST 0x00010101 #define OID_GEN_HARDWARE_STATUS 0x00010102 #define OID_GEN_MEDIA_SUPPORTED 0x00010103 #define OID_GEN_MEDIA_IN_USE 0x00010104 #define OID_GEN_MAXIMUM_LOOKAHEAD 0x00010105 #define OID_GEN_MAXIMUM_FRAME_SIZE 0x00010106 #define OID_GEN_LINK_SPEED 0x00010107 #define OID_GEN_TRANSMIT_BUFFER_SPACE 0x00010108 #define OID_GEN_RECEIVE_BUFFER_SPACE 0x00010109 #define OID_GEN_TRANSMIT_BLOCK_SIZE 0x0001010A #define OID_GEN_RECEIVE_BLOCK_SIZE 0x0001010B #define OID_GEN_VENDOR_ID 0x0001010C #define OID_GEN_VENDOR_DESCRIPTION 0x0001010D #define OID_GEN_CURRENT_PACKET_FILTER 0x0001010E #define OID_GEN_CURRENT_LOOKAHEAD 0x0001010F #define OID_GEN_DRIVER_VERSION 0x00010110 #define OID_GEN_MAXIMUM_TOTAL_SIZE 0x00010111 #define OID_GEN_PROTOCOL_OPTIONS 0x00010112 #define OID_GEN_MAC_OPTIONS 0x00010113 #define OID_GEN_MEDIA_CONNECT_STATUS 0x00010114 #define OID_GEN_MAXIMUM_SEND_PACKETS 0x00010115 #define OID_GEN_VENDOR_DRIVER_VERSION 0x00010116 #define OID_GEN_SUPPORTED_GUIDS 0x00010117 #define OID_GEN_NETWORK_LAYER_ADDRESSES 0x00010118 #define OID_GEN_TRANSPORT_HEADER_OFFSET 0x00010119 #define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203 #define OID_GEN_RECEIVE_SCALE_PARAMETERS 0x00010204 #define OID_GEN_MACHINE_NAME 0x0001021A #define OID_GEN_RNDIS_CONFIG_PARAMETER 0x0001021B #define OID_GEN_VLAN_ID 0x0001021C #define OID_802_3_PERMANENT_ADDRESS 0x01010101 #define OID_802_3_CURRENT_ADDRESS 0x01010102 #define OID_802_3_MULTICAST_LIST 0x01010103 #define OID_802_3_MAXIMUM_LIST_SIZE 0x01010104 #define OID_802_3_MAC_OPTIONS 0x01010105 #define OID_802_3_RCV_ERROR_ALIGNMENT 0x01020101 #define OID_802_3_XMIT_ONE_COLLISION 0x01020102 #define OID_802_3_XMIT_MORE_COLLISIONS 0x01020103 #define OID_802_3_XMIT_DEFERRED 0x01020201 #define OID_802_3_XMIT_MAX_COLLISIONS 0x01020202 #define OID_802_3_RCV_OVERRUN 0x01020203 #define OID_802_3_XMIT_UNDERRUN 0x01020204 #define OID_802_3_XMIT_HEARTBEAT_FAILURE 0x01020205 #define OID_802_3_XMIT_TIMES_CRS_LOST 0x01020206 #define OID_802_3_XMIT_LATE_COLLISIONS 0x01020207 #define OID_TCP_OFFLOAD_PARAMETERS 0xFC01020C #define OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES 0xFC01020D #define RNDIS_MEDIUM_802_3 0x00000000 /* Device flags */ #define RNDIS_DF_CONNECTIONLESS 0x00000001 #define RNDIS_DF_CONNECTION_ORIENTED 0x00000002 /* * Common RNDIS message header. 
*/ struct rndis_msghdr { uint32_t rm_type; uint32_t rm_len; }; /* * RNDIS data message */ #define REMOTE_NDIS_PACKET_MSG 0x00000001 struct rndis_packet_msg { uint32_t rm_type; uint32_t rm_len; uint32_t rm_dataoffset; uint32_t rm_datalen; uint32_t rm_oobdataoffset; uint32_t rm_oobdatalen; uint32_t rm_oobdataelements; uint32_t rm_pktinfooffset; uint32_t rm_pktinfolen; uint32_t rm_vchandle; uint32_t rm_reserved; }; /* * Minimum value for rm_dataoffset, rm_oobdataoffset, and * rm_pktinfooffset. */ #define RNDIS_PACKET_MSG_OFFSET_MIN \ (sizeof(struct rndis_packet_msg) - \ __offsetof(struct rndis_packet_msg, rm_dataoffset)) /* Offset from the beginning of rndis_packet_msg. */ #define RNDIS_PACKET_MSG_OFFSET_ABS(ofs) \ ((ofs) + __offsetof(struct rndis_packet_msg, rm_dataoffset)) #define RNDIS_PACKET_MSG_OFFSET_ALIGN 4 #define RNDIS_PACKET_MSG_OFFSET_ALIGNMASK \ (RNDIS_PACKET_MSG_OFFSET_ALIGN - 1) /* Per-packet-info for RNDIS data message */ struct rndis_pktinfo { uint32_t rm_size; uint32_t rm_type; /* NDIS_PKTINFO_TYPE_ */ uint32_t rm_pktinfooffset; uint8_t rm_data[]; }; #define RNDIS_PKTINFO_OFFSET \ __offsetof(struct rndis_pktinfo, rm_data[0]) #define RNDIS_PKTINFO_SIZE_ALIGN 4 #define RNDIS_PKTINFO_SIZE_ALIGNMASK (RNDIS_PKTINFO_SIZE_ALIGN - 1) #define NDIS_PKTINFO_TYPE_CSUM 0 #define NDIS_PKTINFO_TYPE_IPSEC 1 #define NDIS_PKTINFO_TYPE_LSO 2 #define NDIS_PKTINFO_TYPE_CLASSIFY 3 /* reserved 4 */ #define NDIS_PKTINFO_TYPE_SGLIST 5 #define NDIS_PKTINFO_TYPE_VLAN 6 #define NDIS_PKTINFO_TYPE_ORIG 7 #define NDIS_PKTINFO_TYPE_PKT_CANCELID 8 #define NDIS_PKTINFO_TYPE_ORIG_NBLIST 9 #define NDIS_PKTINFO_TYPE_CACHE_NBLIST 10 #define NDIS_PKTINFO_TYPE_PKT_PAD 11 /* * RNDIS control messages */ /* * Common header for RNDIS completion messages. * * NOTE: It does not apply to REMOTE_NDIS_RESET_CMPLT. */ struct rndis_comp_hdr { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_status; }; /* Initialize the device. */ #define REMOTE_NDIS_INITIALIZE_MSG 0x00000002 #define REMOTE_NDIS_INITIALIZE_CMPLT 0x80000002 struct rndis_init_req { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_ver_major; uint32_t rm_ver_minor; uint32_t rm_max_xfersz; }; struct rndis_init_comp { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_status; uint32_t rm_ver_major; uint32_t rm_ver_minor; uint32_t rm_devflags; uint32_t rm_medium; uint32_t rm_pktmaxcnt; uint32_t rm_pktmaxsz; uint32_t rm_align; uint32_t rm_aflistoffset; uint32_t rm_aflistsz; }; #define RNDIS_INIT_COMP_SIZE_MIN \ __offsetof(struct rndis_init_comp, rm_aflistsz) /* Halt the device. No response sent. */ #define REMOTE_NDIS_HALT_MSG 0x00000003 struct rndis_halt_req { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; }; /* Send a query object. */ #define REMOTE_NDIS_QUERY_MSG 0x00000004 #define REMOTE_NDIS_QUERY_CMPLT 0x80000004 struct rndis_query_req { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_oid; uint32_t rm_infobuflen; uint32_t rm_infobufoffset; uint32_t rm_devicevchdl; }; #define RNDIS_QUERY_REQ_INFOBUFOFFSET \ (sizeof(struct rndis_query_req) - \ __offsetof(struct rndis_query_req, rm_rid)) struct rndis_query_comp { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_status; uint32_t rm_infobuflen; uint32_t rm_infobufoffset; }; /* infobuf offset from the beginning of rndis_query_comp. */ #define RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(ofs) \ ((ofs) + __offsetof(struct rndis_query_req, rm_rid)) /* Send a set object request. 
*/ #define REMOTE_NDIS_SET_MSG 0x00000005 #define REMOTE_NDIS_SET_CMPLT 0x80000005 struct rndis_set_req { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_oid; uint32_t rm_infobuflen; uint32_t rm_infobufoffset; uint32_t rm_devicevchdl; }; #define RNDIS_SET_REQ_INFOBUFOFFSET \ (sizeof(struct rndis_set_req) - \ __offsetof(struct rndis_set_req, rm_rid)) struct rndis_set_comp { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_status; }; /* * Parameter used by OID_GEN_RNDIS_CONFIG_PARAMETER. */ #define REMOTE_NDIS_SET_PARAM_NUMERIC 0x00000000 #define REMOTE_NDIS_SET_PARAM_STRING 0x00000002 struct rndis_set_parameter { uint32_t rm_nameoffset; uint32_t rm_namelen; uint32_t rm_type; uint32_t rm_valueoffset; uint32_t rm_valuelen; }; /* Perform a soft reset on the device. */ #define REMOTE_NDIS_RESET_MSG 0x00000006 #define REMOTE_NDIS_RESET_CMPLT 0x80000006 struct rndis_reset_req { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; }; struct rndis_reset_comp { uint32_t rm_type; uint32_t rm_len; uint32_t rm_status; uint32_t rm_adrreset; }; /* 802.3 link-state or undefined message error. Sent by device. */ #define REMOTE_NDIS_INDICATE_STATUS_MSG 0x00000007 struct rndis_status_msg { uint32_t rm_type; uint32_t rm_len; uint32_t rm_status; uint32_t rm_stbuflen; uint32_t rm_stbufoffset; /* rndis_diag_info */ }; /* stbuf offset from the beginning of rndis_status_msg. */ #define RNDIS_STBUFOFFSET_ABS(ofs) \ ((ofs) + __offsetof(struct rndis_status_msg, rm_status)) /* * Immediately after rndis_status_msg.rm_stbufoffset, if a control * message is malformatted, or a packet message contains inappropriate * content. */ struct rndis_diag_info { uint32_t rm_diagstatus; uint32_t rm_erroffset; }; /* Keepalive messsage. May be sent by device. */ #define REMOTE_NDIS_KEEPALIVE_MSG 0x00000008 #define REMOTE_NDIS_KEEPALIVE_CMPLT 0x80000008 struct rndis_keepalive_req { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; }; struct rndis_keepalive_comp { uint32_t rm_type; uint32_t rm_len; uint32_t rm_rid; uint32_t rm_status; }; /* Packet filter bits used by OID_GEN_CURRENT_PACKET_FILTER */ +#define NDIS_PACKET_TYPE_NONE 0x00000000 #define NDIS_PACKET_TYPE_DIRECTED 0x00000001 #define NDIS_PACKET_TYPE_MULTICAST 0x00000002 #define NDIS_PACKET_TYPE_ALL_MULTICAST 0x00000004 #define NDIS_PACKET_TYPE_BROADCAST 0x00000008 #define NDIS_PACKET_TYPE_SOURCE_ROUTING 0x00000010 #define NDIS_PACKET_TYPE_PROMISCUOUS 0x00000020 #define NDIS_PACKET_TYPE_SMT 0x00000040 #define NDIS_PACKET_TYPE_ALL_LOCAL 0x00000080 #define NDIS_PACKET_TYPE_GROUP 0x00001000 #define NDIS_PACKET_TYPE_ALL_FUNCTIONAL 0x00002000 #define NDIS_PACKET_TYPE_FUNCTIONAL 0x00004000 #define NDIS_PACKET_TYPE_MAC_FRAME 0x00008000 /* * Packet filter description for use with printf(9) %b identifier. */ #define NDIS_PACKET_TYPES \ "\20\1DIRECT\2MULTICAST\3ALLMULTI\4BROADCAST" \ "\5SRCROUTE\6PROMISC\7SMT\10ALLLOCAL" \ "\11GROUP\12ALLFUNC\13FUNC\14MACFRAME" /* RNDIS offsets */ #define RNDIS_HEADER_OFFSET ((uint32_t)sizeof(struct rndis_msghdr)) #define RNDIS_DATA_OFFSET \ ((uint32_t)(sizeof(struct rndis_packet_msg) - RNDIS_HEADER_OFFSET)) #endif /* !_NET_RNDIS_H_ */ Index: user/alc/PQ_LAUNDRY/sys/net80211/ieee80211_scan.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/net80211/ieee80211_scan.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/net80211/ieee80211_scan.c (revision 308054) @@ -1,667 +1,670 @@ /*- * Copyright (c) 2002-2008 Sam Leffler, Errno Consulting * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * IEEE 802.11 scanning support. */ #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include /* XXX until it's implemented as attach ops */ #include #include /* * Roaming-related defaults. RSSI thresholds are as returned by the * driver (.5dBm). Transmit rate thresholds are IEEE rate codes (i.e * .5M units) or MCS. */ /* rssi thresholds */ #define ROAM_RSSI_11A_DEFAULT 14 /* 11a bss */ #define ROAM_RSSI_11B_DEFAULT 14 /* 11b bss */ #define ROAM_RSSI_11BONLY_DEFAULT 14 /* 11b-only bss */ /* transmit rate thresholds */ #define ROAM_RATE_11A_DEFAULT 2*12 /* 11a bss */ #define ROAM_RATE_11B_DEFAULT 2*5 /* 11b bss */ #define ROAM_RATE_11BONLY_DEFAULT 2*1 /* 11b-only bss */ #define ROAM_RATE_HALF_DEFAULT 2*6 /* half-width 11a/g bss */ #define ROAM_RATE_QUARTER_DEFAULT 2*3 /* quarter-width 11a/g bss */ #define ROAM_MCS_11N_DEFAULT (1 | IEEE80211_RATE_MCS) /* 11n bss */ void ieee80211_scan_attach(struct ieee80211com *ic) { /* * If there's no scan method pointer, attach the * swscan set as a default. */ if (ic->ic_scan_methods == NULL) ieee80211_swscan_attach(ic); else ic->ic_scan_methods->sc_attach(ic); } void ieee80211_scan_detach(struct ieee80211com *ic) { /* * Ideally we'd do the ss_ops detach call here; * but then sc_detach() would need to be split in two. * * I'll do that later. 
*/ ic->ic_scan_methods->sc_detach(ic); } static const struct ieee80211_roamparam defroam[IEEE80211_MODE_MAX] = { [IEEE80211_MODE_11A] = { .rssi = ROAM_RSSI_11A_DEFAULT, .rate = ROAM_RATE_11A_DEFAULT }, [IEEE80211_MODE_11G] = { .rssi = ROAM_RSSI_11B_DEFAULT, .rate = ROAM_RATE_11B_DEFAULT }, [IEEE80211_MODE_11B] = { .rssi = ROAM_RSSI_11BONLY_DEFAULT, .rate = ROAM_RATE_11BONLY_DEFAULT }, [IEEE80211_MODE_TURBO_A]= { .rssi = ROAM_RSSI_11A_DEFAULT, .rate = ROAM_RATE_11A_DEFAULT }, [IEEE80211_MODE_TURBO_G]= { .rssi = ROAM_RSSI_11A_DEFAULT, .rate = ROAM_RATE_11A_DEFAULT }, [IEEE80211_MODE_STURBO_A]={ .rssi = ROAM_RSSI_11A_DEFAULT, .rate = ROAM_RATE_11A_DEFAULT }, [IEEE80211_MODE_HALF] = { .rssi = ROAM_RSSI_11A_DEFAULT, .rate = ROAM_RATE_HALF_DEFAULT }, [IEEE80211_MODE_QUARTER]= { .rssi = ROAM_RSSI_11A_DEFAULT, .rate = ROAM_RATE_QUARTER_DEFAULT }, [IEEE80211_MODE_11NA] = { .rssi = ROAM_RSSI_11A_DEFAULT, .rate = ROAM_MCS_11N_DEFAULT }, [IEEE80211_MODE_11NG] = { .rssi = ROAM_RSSI_11B_DEFAULT, .rate = ROAM_MCS_11N_DEFAULT }, }; void ieee80211_scan_vattach(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; vap->iv_bgscanidle = (IEEE80211_BGSCAN_IDLE_DEFAULT*1000)/hz; vap->iv_bgscanintvl = IEEE80211_BGSCAN_INTVAL_DEFAULT*hz; vap->iv_scanvalid = IEEE80211_SCAN_VALID_DEFAULT*hz; vap->iv_roaming = IEEE80211_ROAMING_AUTO; memcpy(vap->iv_roamparms, defroam, sizeof(defroam)); ic->ic_scan_methods->sc_vattach(vap); } void ieee80211_scan_vdetach(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss; IEEE80211_LOCK(ic); ss = ic->ic_scan; ic->ic_scan_methods->sc_vdetach(vap); if (ss != NULL && ss->ss_vap == vap) { if (ss->ss_ops != NULL) { ss->ss_ops->scan_detach(ss); ss->ss_ops = NULL; } ss->ss_vap = NULL; } IEEE80211_UNLOCK(ic); } /* * Simple-minded scanner module support. */ static const char *scan_modnames[IEEE80211_OPMODE_MAX] = { "wlan_scan_sta", /* IEEE80211_M_IBSS */ "wlan_scan_sta", /* IEEE80211_M_STA */ "wlan_scan_wds", /* IEEE80211_M_WDS */ "wlan_scan_sta", /* IEEE80211_M_AHDEMO */ "wlan_scan_ap", /* IEEE80211_M_HOSTAP */ "wlan_scan_monitor", /* IEEE80211_M_MONITOR */ "wlan_scan_sta", /* IEEE80211_M_MBSS */ }; static const struct ieee80211_scanner *scanners[IEEE80211_OPMODE_MAX]; const struct ieee80211_scanner * ieee80211_scanner_get(enum ieee80211_opmode mode) { if (mode >= IEEE80211_OPMODE_MAX) return NULL; if (scanners[mode] == NULL) ieee80211_load_module(scan_modnames[mode]); return scanners[mode]; } void ieee80211_scanner_register(enum ieee80211_opmode mode, const struct ieee80211_scanner *scan) { if (mode >= IEEE80211_OPMODE_MAX) return; scanners[mode] = scan; } void ieee80211_scanner_unregister(enum ieee80211_opmode mode, const struct ieee80211_scanner *scan) { if (mode >= IEEE80211_OPMODE_MAX) return; if (scanners[mode] == scan) scanners[mode] = NULL; } void ieee80211_scanner_unregister_all(const struct ieee80211_scanner *scan) { int m; for (m = 0; m < IEEE80211_OPMODE_MAX; m++) if (scanners[m] == scan) scanners[m] = NULL; } /* * Update common scanner state to reflect the current * operating mode. This is called when the state machine * is transitioned to RUN state w/o scanning--e.g. when * operating in monitor mode. The purpose of this is to * ensure later callbacks find ss_ops set to properly * reflect current operating mode. 
*/ void ieee80211_scan_update_locked(struct ieee80211vap *vap, const struct ieee80211_scanner *scan) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; IEEE80211_LOCK_ASSERT(ic); #ifdef IEEE80211_DEBUG if (ss->ss_vap != vap || ss->ss_ops != scan) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: current scanner is <%s:%s>, switch to <%s:%s>\n", __func__, ss->ss_vap != NULL ? ss->ss_vap->iv_ifp->if_xname : "none", ss->ss_vap != NULL ? ieee80211_opmode_name[ss->ss_vap->iv_opmode] : "none", vap->iv_ifp->if_xname, ieee80211_opmode_name[vap->iv_opmode]); } #endif ss->ss_vap = vap; if (ss->ss_ops != scan) { /* * Switch scanners; detach old, attach new. Special * case where a single scan module implements multiple * policies by using different scan ops but a common * core. We assume if the old and new attach methods * are identical then it's ok to just change ss_ops * and not flush the internal state of the module. */ if (scan == NULL || ss->ss_ops == NULL || ss->ss_ops->scan_attach != scan->scan_attach) { if (ss->ss_ops != NULL) ss->ss_ops->scan_detach(ss); if (scan != NULL && !scan->scan_attach(ss)) { /* XXX attach failure */ /* XXX stat+msg */ scan = NULL; } } ss->ss_ops = scan; } } void ieee80211_scan_dump_channels(const struct ieee80211_scan_state *ss) { struct ieee80211com *ic = ss->ss_ic; const char *sep; int i; sep = ""; for (i = ss->ss_next; i < ss->ss_last; i++) { const struct ieee80211_channel *c = ss->ss_chans[i]; printf("%s%u%c", sep, ieee80211_chan2ieee(ic, c), ieee80211_channel_type_char(c)); sep = ", "; } } #ifdef IEEE80211_DEBUG void ieee80211_scan_dump(struct ieee80211_scan_state *ss) { struct ieee80211vap *vap = ss->ss_vap; if_printf(vap->iv_ifp, "scan set "); ieee80211_scan_dump_channels(ss); printf(" dwell min %lums max %lums\n", ticks_to_msecs(ss->ss_mindwell), ticks_to_msecs(ss->ss_maxdwell)); } #endif /* IEEE80211_DEBUG */ void ieee80211_scan_copy_ssid(struct ieee80211vap *vap, struct ieee80211_scan_state *ss, int nssid, const struct ieee80211_scan_ssid ssids[]) { if (nssid > IEEE80211_SCAN_MAX_SSID) { /* XXX printf */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: too many ssid %d, ignoring all of them\n", __func__, nssid); return; } memcpy(ss->ss_ssid, ssids, nssid * sizeof(ssids[0])); ss->ss_nssid = nssid; } /* * Start a scan unless one is already going. */ int ieee80211_start_scan(struct ieee80211vap *vap, int flags, u_int duration, u_int mindwell, u_int maxdwell, u_int nssid, const struct ieee80211_scan_ssid ssids[]) { const struct ieee80211_scanner *scan; struct ieee80211com *ic = vap->iv_ic; scan = ieee80211_scanner_get(vap->iv_opmode); if (scan == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: no scanner support for %s mode\n", __func__, ieee80211_opmode_name[vap->iv_opmode]); /* XXX stat */ return 0; } return ic->ic_scan_methods->sc_start_scan(scan, vap, flags, duration, mindwell, maxdwell, nssid, ssids); } /* * Check the scan cache for an ap/channel to use; if that * fails then kick off a new scan. 
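 *
 * For example (an illustrative sketch, not a prescribed call site), a
 * one-shot active scan of the configured SSIDs with roughly a 5 second
 * overall budget and default dwell times could be requested with:
 *
 *      ieee80211_start_scan(vap,
 *          IEEE80211_SCAN_ACTIVE | IEEE80211_SCAN_ONCE | IEEE80211_SCAN_NOPICK,
 *          msecs_to_ticks(5000), 0, 0,
 *          vap->iv_des_nssid, vap->iv_des_ssid);
 *
 * whereas callers that can reuse recent results go through this routine,
 * typically via ieee80211_check_scan_current() below.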
*/ int ieee80211_check_scan(struct ieee80211vap *vap, int flags, u_int duration, u_int mindwell, u_int maxdwell, u_int nssid, const struct ieee80211_scan_ssid ssids[]) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; const struct ieee80211_scanner *scan; int result; scan = ieee80211_scanner_get(vap->iv_opmode); if (scan == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: no scanner support for %s mode\n", __func__, vap->iv_opmode); /* XXX stat */ return 0; } /* * Check if there's a list of scan candidates already. * XXX want more than the ap we're currently associated with */ IEEE80211_LOCK(ic); IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: %s scan, %s%s%s%s%s\n" , __func__ , flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive" , flags & IEEE80211_SCAN_FLUSH ? "flush" : "append" , flags & IEEE80211_SCAN_NOPICK ? ", nopick" : "" , flags & IEEE80211_SCAN_NOJOIN ? ", nojoin" : "" , flags & IEEE80211_SCAN_PICK1ST ? ", pick1st" : "" , flags & IEEE80211_SCAN_ONCE ? ", once" : "" ); if (ss->ss_ops != scan) { /* XXX re-use cache contents? e.g. adhoc<->sta */ flags |= IEEE80211_SCAN_FLUSH; } /* * XXX TODO: separate things out a bit better. */ ieee80211_scan_update_locked(vap, scan); result = ic->ic_scan_methods->sc_check_scan(scan, vap, flags, duration, mindwell, maxdwell, nssid, ssids); IEEE80211_UNLOCK(ic); return (result); } /* * Check the scan cache for an ap/channel to use; if that fails * then kick off a scan using the current settings. */ int ieee80211_check_scan_current(struct ieee80211vap *vap) { return ieee80211_check_scan(vap, IEEE80211_SCAN_ACTIVE, IEEE80211_SCAN_FOREVER, 0, 0, vap->iv_des_nssid, vap->iv_des_ssid); } /* * Restart a previous scan. If the previous scan completed * then we start again using the existing channel list. */ int ieee80211_bg_scan(struct ieee80211vap *vap, int flags) { struct ieee80211com *ic = vap->iv_ic; const struct ieee80211_scanner *scan; // IEEE80211_UNLOCK_ASSERT(sc); scan = ieee80211_scanner_get(vap->iv_opmode); if (scan == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: no scanner support for %s mode\n", __func__, vap->iv_opmode); /* XXX stat */ return 0; } /* * XXX TODO: pull apart the bgscan logic into whatever * belongs here and whatever belongs in the software * scanner. */ return (ic->ic_scan_methods->sc_bg_scan(scan, vap, flags)); } /* * Cancel any scan currently going on for the specified vap. */ void ieee80211_cancel_scan(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; ic->ic_scan_methods->sc_cancel_scan(vap); } /* * Cancel any scan currently going on. + * + * This is called during normal 802.11 data path to cancel + * a scan so a newly arrived normal data packet can be sent. */ void ieee80211_cancel_anyscan(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; ic->ic_scan_methods->sc_cancel_anyscan(vap); } /* * Manually switch to the next channel in the channel list. * Provided for drivers that manage scanning themselves * (e.g. for firmware-based devices). */ void ieee80211_scan_next(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; ic->ic_scan_methods->sc_scan_next(vap); } /* * Manually stop a scan that is currently running. * Provided for drivers that are not able to scan single channels * (e.g. for firmware-based devices). 
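 *
 * A hypothetical full-offload driver sketch (fw_scan_complete() and its
 * notification plumbing are invented names) would simply report
 * completion from its "scan finished" event:
 *
 *      static void
 *      fw_scan_complete(struct ieee80211com *ic)
 *      {
 *              struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps);
 *
 *              if (vap != NULL)
 *                      ieee80211_scan_done(vap);
 *      }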
*/ void ieee80211_scan_done(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss; IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: called\n", __func__); IEEE80211_LOCK(ic); ss = ic->ic_scan; ss->ss_next = ss->ss_last; /* all channels are complete */ ic->ic_scan_methods->sc_scan_done(vap); IEEE80211_UNLOCK(ic); } /* * Probe the current channel, if allowed, while scanning. * If the channel is not marked passive-only then send * a probe request immediately. Otherwise mark state and * listen for beacons on the channel; if we receive something * then we'll transmit a probe request. */ void ieee80211_probe_curchan(struct ieee80211vap *vap, int force) { struct ieee80211com *ic = vap->iv_ic; if ((ic->ic_curchan->ic_flags & IEEE80211_CHAN_PASSIVE) && !force) { ic->ic_flags_ext |= IEEE80211_FEXT_PROBECHAN; return; } ic->ic_scan_methods->sc_scan_probe_curchan(vap, force); } #ifdef IEEE80211_DEBUG static void dump_country(const uint8_t *ie) { const struct ieee80211_country_ie *cie = (const struct ieee80211_country_ie *) ie; int i, nbands, schan, nchan; if (cie->len < 3) { printf(" ", cie->len); return; } printf(" country [%c%c%c", cie->cc[0], cie->cc[1], cie->cc[2]); nbands = (cie->len - 3) / sizeof(cie->band[0]); for (i = 0; i < nbands; i++) { schan = cie->band[i].schan; nchan = cie->band[i].nchan; if (nchan != 1) printf(" %u-%u,%u", schan, schan + nchan-1, cie->band[i].maxtxpwr); else printf(" %u,%u", schan, cie->band[i].maxtxpwr); } printf("]"); } void ieee80211_scan_dump_probe_beacon(uint8_t subtype, int isnew, const uint8_t mac[IEEE80211_ADDR_LEN], const struct ieee80211_scanparams *sp, int rssi) { printf("[%s] %s%s on chan %u (bss chan %u) ", ether_sprintf(mac), isnew ? "new " : "", ieee80211_mgt_subtype_name(subtype), sp->chan, sp->bchan); ieee80211_print_essid(sp->ssid + 2, sp->ssid[1]); printf(" rssi %d\n", rssi); if (isnew) { printf("[%s] caps 0x%x bintval %u erp 0x%x", ether_sprintf(mac), sp->capinfo, sp->bintval, sp->erp); if (sp->country != NULL) dump_country(sp->country); printf("\n"); } } #endif /* IEEE80211_DEBUG */ /* * Process a beacon or probe response frame. */ void ieee80211_add_scan(struct ieee80211vap *vap, struct ieee80211_channel *curchan, const struct ieee80211_scanparams *sp, const struct ieee80211_frame *wh, int subtype, int rssi, int noise) { struct ieee80211com *ic = vap->iv_ic; return (ic->ic_scan_methods->sc_add_scan(vap, curchan, sp, wh, subtype, rssi, noise)); } /* * Timeout/age scan cache entries; called from sta timeout * timer (XXX should be self-contained). */ void ieee80211_scan_timeout(struct ieee80211com *ic) { struct ieee80211_scan_state *ss = ic->ic_scan; if (ss->ss_ops != NULL) ss->ss_ops->scan_age(ss); } /* * Mark a scan cache entry after a successful associate. */ void ieee80211_scan_assoc_success(struct ieee80211vap *vap, const uint8_t mac[]) { struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan; if (ss->ss_ops != NULL) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_SCAN, mac, "%s", __func__); ss->ss_ops->scan_assoc_success(ss, mac); } } /* * Demerit a scan cache entry after failing to associate. */ void ieee80211_scan_assoc_fail(struct ieee80211vap *vap, const uint8_t mac[], int reason) { struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan; if (ss->ss_ops != NULL) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_SCAN, mac, "%s: reason %u", __func__, reason); ss->ss_ops->scan_assoc_fail(ss, mac, reason); } } /* * Iterate over the contents of the scan cache. 
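 *
 * Example (an illustrative sketch; count_entry is an invented name).  The
 * callback has the ieee80211_scan_iter_func signature declared in
 * ieee80211_scan.h, i.e. it receives the opaque arg plus one cache entry
 * per invocation, so counting the cached entries looks like:
 *
 *      static void
 *      count_entry(void *arg, const struct ieee80211_scan_entry *se)
 *      {
 *              (*(int *)arg)++;
 *      }
 *
 *      ...
 *      int nentries = 0;
 *
 *      ieee80211_scan_iterate(vap, count_entry, &nentries);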
*/ void ieee80211_scan_iterate(struct ieee80211vap *vap, ieee80211_scan_iter_func *f, void *arg) { struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan; if (ss->ss_ops != NULL) ss->ss_ops->scan_iterate(ss, f, arg); } /* * Flush the contents of the scan cache. */ void ieee80211_scan_flush(struct ieee80211vap *vap) { struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan; if (ss->ss_ops != NULL && ss->ss_vap == vap) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s\n", __func__); ss->ss_ops->scan_flush(ss); } } /* * Check the scan cache for an ap/channel to use; if that * fails then kick off a new scan. */ struct ieee80211_channel * ieee80211_scan_pickchannel(struct ieee80211com *ic, int flags) { struct ieee80211_scan_state *ss = ic->ic_scan; IEEE80211_LOCK_ASSERT(ic); if (ss == NULL || ss->ss_ops == NULL || ss->ss_vap == NULL) { /* XXX printf? */ return NULL; } if (ss->ss_ops->scan_pickchan == NULL) { IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN, "%s: scan module does not support picking a channel, " "opmode %s\n", __func__, ss->ss_vap->iv_opmode); return NULL; } return ss->ss_ops->scan_pickchan(ss, flags); } Index: user/alc/PQ_LAUNDRY/sys/net80211/ieee80211_scan_sw.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/net80211/ieee80211_scan_sw.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/net80211/ieee80211_scan_sw.c (revision 308054) @@ -1,1009 +1,1014 @@ /*- * Copyright (c) 2002-2008 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * IEEE 802.11 scanning support. 
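 *
 * This file implements the software-driven scanner: channel stepping is
 * scheduled on the net80211 taskqueue, while devices with full scan
 * offload (IEEE80211_FEXT_SCAN_OFFLOAD) skip most of the per-channel
 * work below.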
*/ #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct scan_state { struct ieee80211_scan_state base; /* public state */ u_int ss_iflags; /* flags used internally */ #define ISCAN_MINDWELL 0x0001 /* min dwell time reached */ #define ISCAN_DISCARD 0x0002 /* discard rx'd frames */ #define ISCAN_INTERRUPT 0x0004 /* interrupt current scan */ #define ISCAN_CANCEL 0x0008 /* cancel current scan */ #define ISCAN_PAUSE (ISCAN_INTERRUPT | ISCAN_CANCEL) #define ISCAN_ABORT 0x0010 /* end the scan immediately */ #define ISCAN_RUNNING 0x0020 /* scan was started */ unsigned long ss_chanmindwell; /* min dwell on curchan */ unsigned long ss_scanend; /* time scan must stop */ u_int ss_duration; /* duration for next scan */ struct task ss_scan_start; /* scan start */ struct timeout_task ss_scan_curchan; /* scan execution */ }; #define SCAN_PRIVATE(ss) ((struct scan_state *) ss) /* * Amount of time to go off-channel during a background * scan. This value should be large enough to catch most * ap's but short enough that we can return on-channel * before our listen interval expires. * * XXX tunable * XXX check against configured listen interval */ #define IEEE80211_SCAN_OFFCHANNEL msecs_to_ticks(150) static void scan_curchan(struct ieee80211_scan_state *, unsigned long); static void scan_mindwell(struct ieee80211_scan_state *); static void scan_signal(struct ieee80211_scan_state *, int); static void scan_signal_locked(struct ieee80211_scan_state *, int); static void scan_start(void *, int); static void scan_curchan_task(void *, int); static void scan_end(struct ieee80211_scan_state *, int); static void scan_done(struct ieee80211_scan_state *, int); MALLOC_DEFINE(M_80211_SCAN, "80211scan", "802.11 scan state"); static void ieee80211_swscan_detach(struct ieee80211com *ic) { struct ieee80211_scan_state *ss = ic->ic_scan; if (ss != NULL) { scan_signal(ss, ISCAN_ABORT); ieee80211_draintask(ic, &SCAN_PRIVATE(ss)->ss_scan_start); taskqueue_drain_timeout(ic->ic_tq, &SCAN_PRIVATE(ss)->ss_scan_curchan); KASSERT((ic->ic_flags & IEEE80211_F_SCAN) == 0, ("scan still running")); /* * For now, do the ss_ops detach here rather * than ieee80211_scan_detach(). * * I'll figure out how to cleanly split things up * at a later date. */ if (ss->ss_ops != NULL) { ss->ss_ops->scan_detach(ss); ss->ss_ops = NULL; } ic->ic_scan = NULL; IEEE80211_FREE(SCAN_PRIVATE(ss), M_80211_SCAN); } } static void ieee80211_swscan_vattach(struct ieee80211vap *vap) { /* nothing to do for now */ /* * TODO: all of the vap scan calls should be methods! */ } static void ieee80211_swscan_vdetach(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; IEEE80211_LOCK_ASSERT(ic); if (ss != NULL && ss->ss_vap == vap && (ic->ic_flags & IEEE80211_F_SCAN)) scan_signal_locked(ss, ISCAN_ABORT); } static void ieee80211_swscan_set_scan_duration(struct ieee80211vap *vap, u_int duration) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; IEEE80211_LOCK_ASSERT(ic); /* NB: flush frames rx'd before 1st channel change */ SCAN_PRIVATE(ss)->ss_iflags |= ISCAN_DISCARD; SCAN_PRIVATE(ss)->ss_duration = duration; } /* * Start a scan unless one is already going. 
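 *
 * (Called with the comlock held, from ieee80211_swscan_start_scan() and
 * ieee80211_swscan_check_scan() below.)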
*/ static int ieee80211_swscan_start_scan_locked(const struct ieee80211_scanner *scan, struct ieee80211vap *vap, int flags, u_int duration, u_int mindwell, u_int maxdwell, u_int nssid, const struct ieee80211_scan_ssid ssids[]) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; IEEE80211_LOCK_ASSERT(ic); if (ic->ic_flags & IEEE80211_F_CSAPENDING) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: scan inhibited by pending channel change\n", __func__); } else if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: %s scan, duration %u mindwell %u maxdwell %u, desired mode %s, %s%s%s%s%s%s\n" , __func__ , flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive" , duration, mindwell, maxdwell , ieee80211_phymode_name[vap->iv_des_mode] , flags & IEEE80211_SCAN_FLUSH ? "flush" : "append" , flags & IEEE80211_SCAN_NOPICK ? ", nopick" : "" , flags & IEEE80211_SCAN_NOJOIN ? ", nojoin" : "" , flags & IEEE80211_SCAN_NOBCAST ? ", nobcast" : "" , flags & IEEE80211_SCAN_PICK1ST ? ", pick1st" : "" , flags & IEEE80211_SCAN_ONCE ? ", once" : "" ); ieee80211_scan_update_locked(vap, scan); if (ss->ss_ops != NULL) { if ((flags & IEEE80211_SCAN_NOSSID) == 0) ieee80211_scan_copy_ssid(vap, ss, nssid, ssids); /* NB: top 4 bits for internal use */ ss->ss_flags = flags & 0xfff; if (ss->ss_flags & IEEE80211_SCAN_ACTIVE) vap->iv_stats.is_scan_active++; else vap->iv_stats.is_scan_passive++; if (flags & IEEE80211_SCAN_FLUSH) ss->ss_ops->scan_flush(ss); if (flags & IEEE80211_SCAN_BGSCAN) ic->ic_flags_ext |= IEEE80211_FEXT_BGSCAN; /* Set duration for this particular scan */ ieee80211_swscan_set_scan_duration(vap, duration); ss->ss_next = 0; ss->ss_mindwell = mindwell; ss->ss_maxdwell = maxdwell; /* NB: scan_start must be before the scan runtask */ ss->ss_ops->scan_start(ss, vap); #ifdef IEEE80211_DEBUG if (ieee80211_msg_scan(vap)) ieee80211_scan_dump(ss); #endif /* IEEE80211_DEBUG */ ic->ic_flags |= IEEE80211_F_SCAN; /* Start scan task */ ieee80211_runtask(ic, &SCAN_PRIVATE(ss)->ss_scan_start); } return 1; } else { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: %s scan already in progress\n", __func__, ss->ss_flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive"); } return 0; } /* * Start a scan unless one is already going. * * Called without the comlock held; grab the comlock as appropriate. */ static int ieee80211_swscan_start_scan(const struct ieee80211_scanner *scan, struct ieee80211vap *vap, int flags, u_int duration, u_int mindwell, u_int maxdwell, u_int nssid, const struct ieee80211_scan_ssid ssids[]) { struct ieee80211com *ic = vap->iv_ic; int result; IEEE80211_UNLOCK_ASSERT(ic); IEEE80211_LOCK(ic); result = ieee80211_swscan_start_scan_locked(scan, vap, flags, duration, mindwell, maxdwell, nssid, ssids); IEEE80211_UNLOCK(ic); return result; } /* * Check the scan cache for an ap/channel to use; if that * fails then kick off a new scan. * * Called with the comlock held. * * XXX TODO: split out! */ static int ieee80211_swscan_check_scan(const struct ieee80211_scanner *scan, struct ieee80211vap *vap, int flags, u_int duration, u_int mindwell, u_int maxdwell, u_int nssid, const struct ieee80211_scan_ssid ssids[]) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; int result; IEEE80211_LOCK_ASSERT(ic); if (ss->ss_ops != NULL) { /* XXX verify ss_ops matches vap->iv_opmode */ if ((flags & IEEE80211_SCAN_NOSSID) == 0) { /* * Update the ssid list and mark flags so if * we call start_scan it doesn't duplicate work. 
*/ ieee80211_scan_copy_ssid(vap, ss, nssid, ssids); flags |= IEEE80211_SCAN_NOSSID; } if ((ic->ic_flags & IEEE80211_F_SCAN) == 0 && (flags & IEEE80211_SCAN_FLUSH) == 0 && ieee80211_time_before(ticks, ic->ic_lastscan + vap->iv_scanvalid)) { /* * We're not currently scanning and the cache is * deemed hot enough to consult. Lock out others * by marking IEEE80211_F_SCAN while we decide if * something is already in the scan cache we can * use. Also discard any frames that might come * in while temporarily marked as scanning. */ SCAN_PRIVATE(ss)->ss_iflags |= ISCAN_DISCARD; ic->ic_flags |= IEEE80211_F_SCAN; /* NB: need to use supplied flags in check */ ss->ss_flags = flags & 0xff; result = ss->ss_ops->scan_end(ss, vap); ic->ic_flags &= ~IEEE80211_F_SCAN; SCAN_PRIVATE(ss)->ss_iflags &= ~ISCAN_DISCARD; if (result) { ieee80211_notify_scan_done(vap); return 1; } } } result = ieee80211_swscan_start_scan_locked(scan, vap, flags, duration, mindwell, maxdwell, nssid, ssids); return result; } /* * Restart a previous scan. If the previous scan completed * then we start again using the existing channel list. */ static int ieee80211_swscan_bg_scan(const struct ieee80211_scanner *scan, struct ieee80211vap *vap, int flags) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; /* XXX assert unlocked? */ // IEEE80211_UNLOCK_ASSERT(ic); IEEE80211_LOCK(ic); if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) { u_int duration; /* * Go off-channel for a fixed interval that is large * enough to catch most ap's but short enough that * we can return on-channel before our listen interval * expires. */ duration = IEEE80211_SCAN_OFFCHANNEL; IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: %s scan, ticks %u duration %u\n", __func__, ss->ss_flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive", ticks, duration); ieee80211_scan_update_locked(vap, scan); if (ss->ss_ops != NULL) { ss->ss_vap = vap; /* * A background scan does not select a new sta; it * just refreshes the scan cache. Also, indicate * the scan logic should follow the beacon schedule: * we go off-channel and scan for a while, then * return to the bss channel to receive a beacon, * then go off-channel again. All during this time * we notify the ap we're in power save mode. When * the scan is complete we leave power save mode. * If any beacon indicates there are frames pending * for us then we drop out of power save mode * (and background scan) automatically by way of the * usual sta power save logic. */ ss->ss_flags |= IEEE80211_SCAN_NOPICK | IEEE80211_SCAN_BGSCAN | flags ; /* if previous scan completed, restart */ if (ss->ss_next >= ss->ss_last) { if (ss->ss_flags & IEEE80211_SCAN_ACTIVE) vap->iv_stats.is_scan_active++; else vap->iv_stats.is_scan_passive++; /* * NB: beware of the scan cache being flushed; * if the channel list is empty use the * scan_start method to populate it. */ ss->ss_next = 0; if (ss->ss_last != 0) ss->ss_ops->scan_restart(ss, vap); else { ss->ss_ops->scan_start(ss, vap); #ifdef IEEE80211_DEBUG if (ieee80211_msg_scan(vap)) ieee80211_scan_dump(ss); #endif /* IEEE80211_DEBUG */ } } ieee80211_swscan_set_scan_duration(vap, duration); ss->ss_maxdwell = duration; ic->ic_flags |= IEEE80211_F_SCAN; ic->ic_flags_ext |= IEEE80211_FEXT_BGSCAN; ieee80211_runtask(ic, &SCAN_PRIVATE(ss)->ss_scan_start); } else { /* XXX msg+stat */ } } else { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: %s scan already in progress\n", __func__, ss->ss_flags & IEEE80211_SCAN_ACTIVE ? 
"active" : "passive"); } IEEE80211_UNLOCK(ic); /* NB: racey, does it matter? */ return (ic->ic_flags & IEEE80211_F_SCAN); } /* * Taskqueue work to cancel a scan. * * Note: for offload scan devices, we may want to call into the * driver to try and cancel scanning, however it may not be cancelable. */ static void cancel_scan(struct ieee80211vap *vap, int any, const char *func) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; struct scan_state *ss_priv = SCAN_PRIVATE(ss); int signal; IEEE80211_LOCK(ic); signal = any ? ISCAN_PAUSE : ISCAN_CANCEL; if ((ic->ic_flags & IEEE80211_F_SCAN) && (any || ss->ss_vap == vap) && (ss_priv->ss_iflags & signal) == 0) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: %s %s scan\n", func, any ? "pause" : "cancel", ss->ss_flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive"); /* clear bg scan NOPICK */ ss->ss_flags &= ~IEEE80211_SCAN_NOPICK; /* mark request and wake up the scan task */ scan_signal_locked(ss, signal); } else { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: called; F_SCAN=%d, vap=%s, signal=%d\n", func, !! (ic->ic_flags & IEEE80211_F_SCAN), (ss->ss_vap == vap ? "match" : "nomatch"), !! (ss_priv->ss_iflags & signal)); } IEEE80211_UNLOCK(ic); } /* * Cancel any scan currently going on for the specified vap. */ static void ieee80211_swscan_cancel_scan(struct ieee80211vap *vap) { cancel_scan(vap, 0, __func__); } /* * Cancel any scan currently going on. */ static void ieee80211_swscan_cancel_anyscan(struct ieee80211vap *vap) { + + /* XXX for now - just don't do this per packet. */ + if (vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) + return; + cancel_scan(vap, 1, __func__); } /* * Manually switch to the next channel in the channel list. * Provided for drivers that manage scanning themselves * (e.g. for firmware-based devices). */ static void ieee80211_swscan_scan_next(struct ieee80211vap *vap) { struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan; IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: called\n", __func__); /* wake up the scan task */ scan_signal(ss, 0); } /* * Manually stop a scan that is currently running. * Provided for drivers that are not able to scan single channels * (e.g. for firmware-based devices). */ static void ieee80211_swscan_scan_done(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; IEEE80211_LOCK_ASSERT(ic); scan_signal_locked(ss, 0); } /* * Probe the current channel, if allowed, while scanning. * If the channel is not marked passive-only then send * a probe request immediately. Otherwise mark state and * listen for beacons on the channel; if we receive something * then we'll transmit a probe request. */ static void ieee80211_swscan_probe_curchan(struct ieee80211vap *vap, int force) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; struct ifnet *ifp = vap->iv_ifp; int i; /* * Full-offload scan devices don't require this. */ if (vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) return; /* * Send directed probe requests followed by any * broadcast probe request. * XXX remove dependence on ic/vap->iv_bss */ for (i = 0; i < ss->ss_nssid; i++) ieee80211_send_probereq(vap->iv_bss, vap->iv_myaddr, ifp->if_broadcastaddr, ifp->if_broadcastaddr, ss->ss_ssid[i].ssid, ss->ss_ssid[i].len); if ((ss->ss_flags & IEEE80211_SCAN_NOBCAST) == 0) ieee80211_send_probereq(vap->iv_bss, vap->iv_myaddr, ifp->if_broadcastaddr, ifp->if_broadcastaddr, "", 0); } /* * Scan curchan. 
If this is an active scan and the channel * is not marked passive then send probe request frame(s). * Arrange for the channel change after maxdwell ticks. */ static void scan_curchan(struct ieee80211_scan_state *ss, unsigned long maxdwell) { struct ieee80211vap *vap = ss->ss_vap; struct ieee80211com *ic = ss->ss_ic; IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: calling; maxdwell=%lu\n", __func__, maxdwell); IEEE80211_LOCK(ic); if (ss->ss_flags & IEEE80211_SCAN_ACTIVE) ieee80211_probe_curchan(vap, 0); taskqueue_enqueue_timeout(ic->ic_tq, &SCAN_PRIVATE(ss)->ss_scan_curchan, maxdwell); IEEE80211_UNLOCK(ic); } static void scan_signal(struct ieee80211_scan_state *ss, int iflags) { struct ieee80211com *ic = ss->ss_ic; IEEE80211_UNLOCK_ASSERT(ic); IEEE80211_LOCK(ic); scan_signal_locked(ss, iflags); IEEE80211_UNLOCK(ic); } static void scan_signal_locked(struct ieee80211_scan_state *ss, int iflags) { struct scan_state *ss_priv = SCAN_PRIVATE(ss); struct timeout_task *scan_task = &ss_priv->ss_scan_curchan; struct ieee80211com *ic = ss->ss_ic; IEEE80211_LOCK_ASSERT(ic); ss_priv->ss_iflags |= iflags; if (ss_priv->ss_iflags & ISCAN_RUNNING) { if (taskqueue_cancel_timeout(ic->ic_tq, scan_task, NULL) == 0) taskqueue_enqueue_timeout(ic->ic_tq, scan_task, 0); } } /* * Handle mindwell requirements completed; initiate a channel * change to the next channel asap. */ static void scan_mindwell(struct ieee80211_scan_state *ss) { IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN, "%s: called\n", __func__); scan_signal(ss, 0); } static void scan_start(void *arg, int pending) { #define ISCAN_REP (ISCAN_MINDWELL | ISCAN_DISCARD) struct ieee80211_scan_state *ss = (struct ieee80211_scan_state *) arg; struct scan_state *ss_priv = SCAN_PRIVATE(ss); struct ieee80211vap *vap = ss->ss_vap; struct ieee80211com *ic = ss->ss_ic; IEEE80211_LOCK(ic); if (vap == NULL || (ic->ic_flags & IEEE80211_F_SCAN) == 0 || (ss_priv->ss_iflags & ISCAN_ABORT)) { /* Cancelled before we started */ scan_done(ss, 0); return; } if (ss->ss_next == ss->ss_last) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: no channels to scan\n", __func__); scan_done(ss, 1); return; } /* * Put the station into power save mode. * * This is only required if we're not a full-offload devices; * those devices manage scan/traffic differently. */ if (((vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) == 0) && vap->iv_opmode == IEEE80211_M_STA && vap->iv_state == IEEE80211_S_RUN) { if ((vap->iv_bss->ni_flags & IEEE80211_NODE_PWR_MGT) == 0) { /* Enable station power save mode */ vap->iv_sta_ps(vap, 1); /* Wait until null data frame will be ACK'ed */ mtx_sleep(vap, IEEE80211_LOCK_OBJ(ic), PCATCH, "sta_ps", msecs_to_ticks(10)); if (ss_priv->ss_iflags & ISCAN_ABORT) { scan_done(ss, 0); return; } } } ss_priv->ss_scanend = ticks + ss_priv->ss_duration; /* XXX scan state can change! Re-validate scan state! 
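 *
 * A rough sketch of what happens next: ss_scanend (just computed as
 * ticks + ss_duration) bounds the whole scan, the driver is told the
 * scan is starting via ic_scan_start(), and the first channel visit
 * is made by calling scan_curchan_task() directly rather than waiting
 * for its taskqueue timeout to fire.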
*/ IEEE80211_UNLOCK(ic); ic->ic_scan_start(ic); /* notify driver */ scan_curchan_task(ss, 0); } static void scan_curchan_task(void *arg, int pending) { struct ieee80211_scan_state *ss = arg; struct scan_state *ss_priv = SCAN_PRIVATE(ss); struct ieee80211com *ic = ss->ss_ic; struct ieee80211_channel *chan; unsigned long maxdwell; int scandone; IEEE80211_LOCK(ic); end: scandone = (ss->ss_next >= ss->ss_last) || (ss_priv->ss_iflags & ISCAN_CANCEL) != 0; IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN, "%s: loop start; scandone=%d\n", __func__, scandone); if (scandone || (ss->ss_flags & IEEE80211_SCAN_GOTPICK) || (ss_priv->ss_iflags & ISCAN_ABORT) || ieee80211_time_after(ticks + ss->ss_mindwell, ss_priv->ss_scanend)) { ss_priv->ss_iflags &= ~ISCAN_RUNNING; scan_end(ss, scandone); return; } else ss_priv->ss_iflags |= ISCAN_RUNNING; chan = ss->ss_chans[ss->ss_next++]; /* * Watch for truncation due to the scan end time. */ if (ieee80211_time_after(ticks + ss->ss_maxdwell, ss_priv->ss_scanend)) maxdwell = ss_priv->ss_scanend - ticks; else maxdwell = ss->ss_maxdwell; IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN, "%s: chan %3d%c -> %3d%c [%s, dwell min %lums max %lums]\n", __func__, ieee80211_chan2ieee(ic, ic->ic_curchan), ieee80211_channel_type_char(ic->ic_curchan), ieee80211_chan2ieee(ic, chan), ieee80211_channel_type_char(chan), (ss->ss_flags & IEEE80211_SCAN_ACTIVE) && (chan->ic_flags & IEEE80211_CHAN_PASSIVE) == 0 ? "active" : "passive", ticks_to_msecs(ss->ss_mindwell), ticks_to_msecs(maxdwell)); /* * Potentially change channel and phy mode. */ ic->ic_curchan = chan; ic->ic_rt = ieee80211_get_ratetable(chan); IEEE80211_UNLOCK(ic); /* * Perform the channel change and scan unlocked so the driver * may sleep. Once set_channel returns the hardware has * completed the channel change. */ ic->ic_set_channel(ic); ieee80211_radiotap_chan_change(ic); /* * Scan curchan. Drivers for "intelligent hardware" * override ic_scan_curchan to tell the device to do * the work. Otherwise we manage the work ourselves; * sending a probe request (as needed), and arming the * timeout to switch channels after maxdwell ticks. * * scan_curchan should only pause for the time required to * prepare/initiate the hardware for the scan (if at all). */ ic->ic_scan_curchan(ss, maxdwell); IEEE80211_LOCK(ic); /* XXX scan state can change! Re-validate scan state! */ ss_priv->ss_chanmindwell = ticks + ss->ss_mindwell; /* clear mindwell lock and initial channel change flush */ ss_priv->ss_iflags &= ~ISCAN_REP; if (ss_priv->ss_iflags & (ISCAN_CANCEL|ISCAN_ABORT)) { taskqueue_cancel_timeout(ic->ic_tq, &ss_priv->ss_scan_curchan, NULL); goto end; } IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN, "%s: waiting\n", __func__); IEEE80211_UNLOCK(ic); } static void scan_end(struct ieee80211_scan_state *ss, int scandone) { struct scan_state *ss_priv = SCAN_PRIVATE(ss); struct ieee80211vap *vap = ss->ss_vap; struct ieee80211com *ic = ss->ss_ic; IEEE80211_LOCK_ASSERT(ic); IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: out\n", __func__); if (ss_priv->ss_iflags & ISCAN_ABORT) { scan_done(ss, scandone); return; } IEEE80211_UNLOCK(ic); ic->ic_scan_end(ic); /* notify driver */ IEEE80211_LOCK(ic); /* XXX scan state can change! Re-validate scan state! */ /* * Since a cancellation may have occurred during one of the * driver calls (whilst unlocked), update scandone. */ if (scandone == 0 && (ss_priv->ss_iflags & ISCAN_CANCEL) != 0) { /* XXX printf? */ if_printf(vap->iv_ifp, "%s: OOPS! 
scan cancelled during driver call (1)!\n", __func__); scandone = 1; } /* * Record scan complete time. Note that we also do * this when canceled so any background scan will * not be restarted for a while. */ if (scandone) ic->ic_lastscan = ticks; /* return to the bss channel */ if (ic->ic_bsschan != IEEE80211_CHAN_ANYC && ic->ic_curchan != ic->ic_bsschan) { ieee80211_setupcurchan(ic, ic->ic_bsschan); IEEE80211_UNLOCK(ic); ic->ic_set_channel(ic); ieee80211_radiotap_chan_change(ic); IEEE80211_LOCK(ic); } /* clear internal flags and any indication of a pick */ ss_priv->ss_iflags &= ~ISCAN_REP; ss->ss_flags &= ~IEEE80211_SCAN_GOTPICK; /* * If not canceled and scan completed, do post-processing. * If the callback function returns 0, then it wants to * continue/restart scanning. Unfortunately we needed to * notify the driver to end the scan above to avoid having * rx frames alter the scan candidate list. */ if ((ss_priv->ss_iflags & ISCAN_CANCEL) == 0 && !ss->ss_ops->scan_end(ss, vap) && (ss->ss_flags & IEEE80211_SCAN_ONCE) == 0 && ieee80211_time_before(ticks + ss->ss_mindwell, ss_priv->ss_scanend)) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: done, restart " "[ticks %u, dwell min %lu scanend %lu]\n", __func__, ticks, ss->ss_mindwell, ss_priv->ss_scanend); ss->ss_next = 0; /* reset to beginning */ if (ss->ss_flags & IEEE80211_SCAN_ACTIVE) vap->iv_stats.is_scan_active++; else vap->iv_stats.is_scan_passive++; ss->ss_ops->scan_restart(ss, vap); /* XXX? */ ieee80211_runtask(ic, &ss_priv->ss_scan_start); IEEE80211_UNLOCK(ic); return; } /* past here, scandone is ``true'' if not in bg mode */ if ((ss->ss_flags & IEEE80211_SCAN_BGSCAN) == 0) scandone = 1; IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: %s, [ticks %u, dwell min %lu scanend %lu]\n", __func__, scandone ? "done" : "stopped", ticks, ss->ss_mindwell, ss_priv->ss_scanend); /* * Since a cancellation may have occurred during one of the * driver calls (whilst unlocked), update scandone. */ if (scandone == 0 && (ss_priv->ss_iflags & ISCAN_CANCEL) != 0) { /* XXX printf? */ if_printf(vap->iv_ifp, "%s: OOPS! scan cancelled during driver call (2)!\n", __func__); scandone = 1; } scan_done(ss, scandone); } static void scan_done(struct ieee80211_scan_state *ss, int scandone) { struct scan_state *ss_priv = SCAN_PRIVATE(ss); struct ieee80211com *ic = ss->ss_ic; struct ieee80211vap *vap = ss->ss_vap; IEEE80211_LOCK_ASSERT(ic); /* * Clear the SCAN bit first in case frames are * pending on the station power save queue. If * we defer this then the dispatch of the frames * may generate a request to cancel scanning. */ ic->ic_flags &= ~IEEE80211_F_SCAN; /* * Drop out of power save mode when a scan has * completed. If this scan was prematurely terminated * because it is a background scan then don't notify * the ap; we'll either return to scanning after we * receive the beacon frame or we'll drop out of power * save mode because the beacon indicates we have frames * waiting for us. */ if (scandone) { /* * If we're not a scan offload device, come back out of * station powersave. Offload devices handle this themselves. */ if ((vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) == 0) vap->iv_sta_ps(vap, 0); if (ss->ss_next >= ss->ss_last) ic->ic_flags_ext &= ~IEEE80211_FEXT_BGSCAN; /* send 'scan done' event if not interrupted due to traffic. 
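 * (ISCAN_INTERRUPT marks a scan that was broken off so that traffic
 * could flow; skipping ieee80211_notify_scan_done() here keeps a
 * paused background scan from being reported to listeners as a
 * completed one.)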
*/ if (!(ss_priv->ss_iflags & ISCAN_INTERRUPT)) ieee80211_notify_scan_done(vap); } ss_priv->ss_iflags &= ~(ISCAN_PAUSE | ISCAN_ABORT); ss_priv->ss_scanend = 0; ss->ss_flags &= ~(IEEE80211_SCAN_ONCE | IEEE80211_SCAN_PICK1ST); IEEE80211_UNLOCK(ic); #undef ISCAN_REP } /* * Process a beacon or probe response frame. */ static void ieee80211_swscan_add_scan(struct ieee80211vap *vap, struct ieee80211_channel *curchan, const struct ieee80211_scanparams *sp, const struct ieee80211_frame *wh, int subtype, int rssi, int noise) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_scan_state *ss = ic->ic_scan; /* XXX locking */ /* * Frames received during startup are discarded to avoid * using scan state setup on the initial entry to the timer * callback. This can occur because the device may enable * rx prior to our doing the initial channel change in the * timer routine. */ if (SCAN_PRIVATE(ss)->ss_iflags & ISCAN_DISCARD) return; #ifdef IEEE80211_DEBUG if (ieee80211_msg_scan(vap) && (ic->ic_flags & IEEE80211_F_SCAN)) ieee80211_scan_dump_probe_beacon(subtype, 1, wh->i_addr2, sp, rssi); #endif if (ss->ss_ops != NULL && ss->ss_ops->scan_add(ss, curchan, sp, wh, subtype, rssi, noise)) { /* * If we've reached the min dwell time terminate * the timer so we'll switch to the next channel. */ if ((SCAN_PRIVATE(ss)->ss_iflags & ISCAN_MINDWELL) == 0 && ieee80211_time_after_eq(ticks, SCAN_PRIVATE(ss)->ss_chanmindwell)) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: chan %3d%c min dwell met (%u > %lu)\n", __func__, ieee80211_chan2ieee(ic, ic->ic_curchan), ieee80211_channel_type_char(ic->ic_curchan), ticks, SCAN_PRIVATE(ss)->ss_chanmindwell); SCAN_PRIVATE(ss)->ss_iflags |= ISCAN_MINDWELL; /* * NB: trigger at next clock tick or wait for the * hardware. */ ic->ic_scan_mindwell(ss); } } } static struct ieee80211_scan_methods swscan_methods = { .sc_attach = ieee80211_swscan_attach, .sc_detach = ieee80211_swscan_detach, .sc_vattach = ieee80211_swscan_vattach, .sc_vdetach = ieee80211_swscan_vdetach, .sc_set_scan_duration = ieee80211_swscan_set_scan_duration, .sc_start_scan = ieee80211_swscan_start_scan, .sc_check_scan = ieee80211_swscan_check_scan, .sc_bg_scan = ieee80211_swscan_bg_scan, .sc_cancel_scan = ieee80211_swscan_cancel_scan, .sc_cancel_anyscan = ieee80211_swscan_cancel_anyscan, .sc_scan_next = ieee80211_swscan_scan_next, .sc_scan_done = ieee80211_swscan_scan_done, .sc_scan_probe_curchan = ieee80211_swscan_probe_curchan, .sc_add_scan = ieee80211_swscan_add_scan }; /* * Default scan attach method. */ void ieee80211_swscan_attach(struct ieee80211com *ic) { struct scan_state *ss; /* * Setup the default methods */ ic->ic_scan_methods = &swscan_methods; /* Allocate initial scan state */ ss = (struct scan_state *) IEEE80211_MALLOC(sizeof(struct scan_state), M_80211_SCAN, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); if (ss == NULL) { ic->ic_scan = NULL; return; } TASK_INIT(&ss->ss_scan_start, 0, scan_start, ss); TIMEOUT_TASK_INIT(ic->ic_tq, &ss->ss_scan_curchan, 0, scan_curchan_task, ss); ic->ic_scan = &ss->base; ss->base.ss_ic = ic; ic->ic_scan_curchan = scan_curchan; ic->ic_scan_mindwell = scan_mindwell; } Index: user/alc/PQ_LAUNDRY/sys/sys/buf.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/buf.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/sys/buf.h (revision 308054) @@ -1,542 +1,549 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; +struct vm_page; typedef unsigned char b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. 
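 *
 *	An illustrative example (numbers are hypothetical): for a read
 *	of a 2048 byte fragment, b_bcount is 2048 (the requested size),
 *	b_bufsize is the DEV_BSIZE or PAGE_SIZE aligned allocation
 *	backing it, and after a successful I/O b_resid is 0, so
 *	b_bcount - b_resid gives the bytes actually transferred.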
* * All fields are protected by the buffer lock except those marked: * V - Protected by owning bufobj lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */ uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */ off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. */ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ uint32_t b_vflags; /* (V) BV_* flags */ unsigned short b_qindex; /* (Q) buffer queue index */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ int b_runningbufspace; /* when I/O is running, pipelining */ int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ caddr_t b_kvabase; /* base kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ union { TAILQ_ENTRY(buf) b_freelist; /* (Q) */ struct { void (*b_pgiodone)(void *, vm_page_t *, int, int); int b_pgbefore; int b_pgafter; }; }; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. 
B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_00000100 0x00000100 /* Available flag. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_NOREUSE 0x00000800 /* Contents not reused once released. */ #define B_00001000 0x00001000 /* Available flag. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_00040000 0x00040000 /* Available flag. */ #define B_00080000 0x00080000 /* Available flag. */ #define B_00100000 0x00100000 /* Available flag. */ #define B_00200000 0x00200000 /* Available flag. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */ #define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */ #define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \ "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15b12\14noreuse\13eintr\12done\11b8\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ #define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\2clean\1dirty" #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #define BV_BKGRDERR 0x00000008 /* Error from background write */ #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL /* * Buffer locking */ extern const char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. 
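 *
 * A typical calling sketch (mirroring users such as ffs_syncvnode(),
 * which uses the non-blocking LK_NOWAIT form):
 *
 *	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
 *		... examine or write the buffer ...
 *		BUF_UNLOCK(bp);
 *	}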
*/ #define BUF_LOCK(bp, locktype, interlock) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE) /* * Get a lock sleeping with specified interruptably and timeout. */ #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ LOCK_FILE, LOCK_LINE) /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ #define BUF_UNLOCK(bp) do { \ KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ \ (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE); \ } while (0) /* * Check if a buffer lock is recursed. */ #define BUF_LOCKRECURSED(bp) \ lockmgr_recursed(&(bp)->b_lock) /* * Check if a buffer lock is currently held. */ #define BUF_ISLOCKED(bp) \ lockstatus(&(bp)->b_lock) /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ lockdestroy(&(bp)->b_lock) /* * Print informations on a buffer lock. */ #define BUF_LOCKPRINTINFO(bp) \ lockmgr_printinfo(&(bp)->b_lock) /* * Buffer lock assertions. */ #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) #define BUF_ASSERT_LOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_SLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_XLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) #define BUF_ASSERT_HELD(bp) #define BUF_ASSERT_UNHELD(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ #define BUF_KERNPROC(bp) \ _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. 
*/ }; #ifdef _KERNEL static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } #endif /* _KERNEL */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ #define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ #define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ extern long maxbcache; /* Max KVA for buffer cache */ extern long runningbufspace; extern long hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ extern int vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager, asynchronous reads */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */ static inline int buf_mapped(struct buf *bp) { return (bp->b_data != unmapped_buf); } void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); void bufshutdown(int); void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. 
*/ #define bread(vp, blkno, size, cred, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, bpp) #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ gbflags, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, 0, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, int, struct buf **); void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); void bdwrite(struct buf *); void bawrite(struct buf *); void babarrierwrite(struct buf *); int bbarrierwrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bufdone_finish(struct buf *); void bd_speedup(void); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); + +typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t); +typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t); +int vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count, + int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, + vbg_get_blksize_t get_blksize); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ Index: user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_vnops.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_vnops.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_vnops.c (revision 308054) @@ -1,1950 +1,1822 @@ /*- * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... 
* @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_ffs.h" #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif static vop_fdatasync_t ffs_fdatasync; static vop_fsync_t ffs_fsync; static vop_getpages_t ffs_getpages; static vop_lock1_t ffs_lock; static vop_read_t ffs_read; static vop_write_t ffs_write; static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred); static vop_strategy_t ffsext_strategy; static vop_closeextattr_t ffs_closeextattr; static vop_deleteextattr_t ffs_deleteextattr; static vop_getextattr_t ffs_getextattr; static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */ .vop_vptofh = ffs_vptofh, }; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_lock1 = ffs_lock, .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; /* * Synch an open file. */ /* ARGSUSED */ static int ffs_fsync(struct vop_fsync_args *ap) { struct vnode *vp; struct bufobj *bo; int error; vp = ap->a_vp; bo = &vp->v_bufobj; retry: error = ffs_syncvnode(vp, ap->a_waitfor, 0); if (error) return (error); if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { error = softdep_fsync(vp); if (error) return (error); /* * The softdep_fsync() function may drop vp lock, * allowing for dirty buffers to reappear on the * bo_dirty list. Recheck and resync as needed. 
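 * The recheck below is done under the bufobj lock: if writes are
 * still in flight (bo_numoutput) or dirty buffers reappeared
 * (bo_dirty.bv_cnt), the whole fsync is retried from the top.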
*/ BO_LOCK(bo); if ((vp->v_type == VREG || vp->v_type == VDIR) && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { BO_UNLOCK(bo); goto retry; } BO_UNLOCK(bo); } return (0); } int ffs_syncvnode(struct vnode *vp, int waitfor, int flags) { struct inode *ip; struct bufobj *bo; struct buf *bp, *nbp; ufs_lbn_t lbn; int error, passes; bool still_dirty, wait; ip = VTOI(vp); ip->i_flag &= ~IN_NEEDSYNC; bo = &vp->v_bufobj; /* * When doing MNT_WAIT we must first flush all dependencies * on the inode. */ if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && (error = softdep_sync_metadata(vp)) != 0) return (error); /* * Flush all dirty buffers associated with a vnode. */ error = 0; passes = 0; wait = false; /* Always do an async pass first. */ lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); BO_LOCK(bo); loop: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * Reasons to skip this buffer: it has already been considered * on this pass, the buffer has dependencies that will cause * it to be redirtied and it has not already been deferred, * or it is already being written. */ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; /* * Flush indirects in order, if requested. * * Note that if only datasync is requested, we can * skip indirect blocks when softupdates are not * active. Otherwise we must flush them with data, * since dependencies prevent data block writes. */ if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR && (lbn_level(bp->b_lblkno) >= passes || ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) continue; if (bp->b_lblkno > lbn) panic("ffs_syncvnode: syncing truncated data."); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { BO_UNLOCK(bo); } else if (wait) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) != 0) { bp->b_vflags &= ~BV_SCANNED; goto next; } } else continue; if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); /* * Check for dependencies and potentially complete them. */ if (!LIST_EMPTY(&bp->b_dep) && (error = softdep_sync_buf(vp, bp, wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { /* I/O error. */ if (error != EBUSY) { BUF_UNLOCK(bp); return (error); } /* If we deferred once, don't defer again. */ if ((bp->b_flags & B_DEFERRED) == 0) { bp->b_flags |= B_DEFERRED; BUF_UNLOCK(bp); goto next; } } if (wait) { bremfree(bp); if ((error = bwrite(bp)) != 0) return (error); } else if ((bp->b_flags & B_CLUSTEROK)) { (void) vfs_bio_awrite(bp); } else { bremfree(bp); (void) bawrite(bp); } next: /* * Since we may have slept during the I/O, we need * to start from a known point. */ BO_LOCK(bo); nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); } if (waitfor != MNT_WAIT) { BO_UNLOCK(bo); if ((flags & NO_INO_UPDT) != 0) return (0); else return (ffs_update(vp, 0)); } /* Drain IO to see if we're done. */ bufobj_wwait(bo, 0, 0); /* * Block devices associated with filesystems may have new I/O * requests posted for them even if the vnode is locked, so no * amount of trying will get them clean. We make several passes * as a best effort. * * Regular files may need multiple passes to flush all dependency * work as it is possible that we must write once per indirect * level, once for the leaf, and once for the inode and each of * these will be done with one sync and one async pass. */ if (bo->bo_dirty.bv_cnt > 0) { if ((flags & DATA_ONLY) == 0) { still_dirty = true; } else { /* * For data-only sync, dirty indirect buffers * are ignored. 
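 * (Indirect block buffers are recognized by logical block numbers
 * at or below -NDADDR, as in the lbn_level() check earlier in this
 * function, so only buffers with b_lblkno > -NDADDR keep
 * still_dirty set here.)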
*/ still_dirty = false; TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { if (bp->b_lblkno > -NDADDR) { still_dirty = true; break; } } } if (still_dirty) { /* Write the inode after sync passes to flush deps. */ if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) { BO_UNLOCK(bo); ffs_update(vp, 1); BO_LOCK(bo); } /* switch between sync/async. */ wait = !wait; if (wait || ++passes < NIADDR + 2) goto loop; #ifdef INVARIANTS if (!vn_isdisk(vp, NULL)) vn_printf(vp, "ffs_fsync: dirty "); #endif } } BO_UNLOCK(bo); error = 0; if ((flags & DATA_ONLY) == 0) { if ((flags & NO_INO_UPDT) == 0) error = ffs_update(vp, 1); if (DOINGSUJ(vp)) softdep_journal_fsync(VTOI(vp)); } return (error); } static int ffs_fdatasync(struct vop_fdatasync_args *ap) { return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); } static int ffs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; char *file; int line; } */ *ap; { #ifndef NO_FFS_SNAPSHOT struct vnode *vp; int flags; struct lock *lkp; int result; switch (ap->a_flags & LK_TYPE_MASK) { case LK_SHARED: case LK_UPGRADE: case LK_EXCLUSIVE: vp = ap->a_vp; flags = ap->a_flags; for (;;) { #ifdef DEBUG_VFS_LOCKS KASSERT(vp->v_holdcnt != 0, ("ffs_lock %p: zero hold count", vp)); #endif lkp = vp->v_vnlock; result = _lockmgr_args(lkp, flags, VI_MTX(vp), LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file, ap->a_line); if (lkp == vp->v_vnlock || result != 0) break; /* * Apparent success, except that the vnode * mutated between snapshot file vnode and * regular file vnode while this process * slept. The lock currently held is not the * right lock. Release it, and try to get the * new lock. */ (void) _lockmgr_args(lkp, LK_RELEASE, NULL, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file, ap->a_line); if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == (LK_INTERLOCK | LK_NOWAIT)) return (EBUSY); if ((flags & LK_TYPE_MASK) == LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; } break; default: result = VOP_LOCK1_APV(&ufs_vnodeops, ap); } return (result); #else return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); #endif } /* * Vnode op for reading. 
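 *
 * The loop below works one filesystem block at a time: map
 * uio_offset to a logical block with lblkno(), size it with
 * blksize(), bring it in through bread_gb(), cluster_read() or
 * breadn_flags() depending on the readahead policy, copy it out
 * with vn_io_fault_uiomove() or vn_io_fault_pgmove(), and release
 * the buffer with brelse() or bqrelse().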
*/ static int ffs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct inode *ip; struct uio *uio; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int error; int seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extread(vp, uio, ioflag)); #else panic("ffs_read+IO_EXT"); #endif #ifdef DIRECTIO if ((ioflag & IO_DIRECT) != 0) { int workdone; error = ffs_rawread(vp, uio, &workdone); if (error != 0 || workdone != 0) return error; } #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("ffs_read: mode"); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("ffs_read: short symlink"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("ffs_read: type %d", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); fs = ITOFS(ip); if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) { /* * Don't do readahead if this is the end of the file. */ error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, * grab as much as we can. * * XXX This may not be a win if we are not * doing sequential access. */ error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, GB_UNMAPPED, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then * if we appear to be acting sequentially, * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ u_int nextsize = blksize(fs, ip, nextlbn); error = breadn_flags(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, GB_UNMAPPED, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * If IO_DIRECT then set B_DIRECT for the buffer. This * will cause us to attempt to release the buffer later on * and will cause the buffer cache to attempt to free the * underlying pages. 
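 * (Concretely, a B_DIRECT buffer with no dependencies takes the
 * B_RELBUF + brelse() path at the bottom of the loop rather than
 * bqrelse(), allowing the pages behind it to be reclaimed.)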
*/ if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } if (error) break; if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { /* * If there are no dependencies, and it's VMIO, * then we don't need the buf, mark it available * for freeing. For non-direct VMIO reads, the VM * has the data. */ bp->b_flags |= B_RELBUF; brelse(bp); } else { /* * Otherwise let whoever * made the request take care of * freeing it. We just queue * it onto another list. */ bqrelse(bp); } } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) { if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bqrelse(bp); } } if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && (ip->i_flag & IN_ACCESS) == 0) { VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); } return (error); } /* * Vnode op for writing. */ static int ffs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct uio *uio; struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int seqcount; int blkoffset, error, flags, ioflag, size, xfersize; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); #else panic("ffs_write+IO_EXT"); #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("ffs_write: mode"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: panic("ffs_write: dir write"); break; default: panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, (int)uio->uio_offset, (int)uio->uio_resid ); } KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); fs = ITOFS(ip); if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. 
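 * (vn_rlimit_fsize() applies the writing thread's file size resource
 * limit; a write that would exceed it fails here with EFBIG, just as
 * for the fs_maxfilesize check above.)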
*/ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if (ioflag & IO_SYNC) flags |= IO_SYNC; flags |= BA_UNMAPPED; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; /* XXX is uio->uio_offset the right thing here? */ error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ap->a_cred, flags, &bp); if (error != 0) { vnode_pager_setsize(vp, ip->i_size); break; } if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; DIP_SET(ip, i_size, ip->i_size); } size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to UFS_BALLOC() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. */ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->fs_bsize == xfersize) vfs_bio_clrbuf(bp); if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; } /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount, GB_UNMAPPED); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
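 * ("Wrote any data" is detected by the saved residual having shrunk,
 * i.e. resid > uio->uio_resid, and priv_check_cred() failing for
 * PRIV_VFS_RETAINSUGID is what actually triggers clearing the bits,
 * so privileged writers keep them.)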
*/ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Extended attribute area reading. */ static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int error; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extread: mode"); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = sblksize(fs, dp->di_extsize, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= dp->di_extsize) { /* * Don't do readahead if this is the end of the info. */ error = bread(vp, -1 - lbn, size, NOCRED, &bp); } else { /* * If we have a second block, then * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); nextlbn = -1 - nextlbn; error = breadn(vp, -1 - lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * If IO_DIRECT then set B_DIRECT for the buffer. This * will cause us to attempt to release the buffer later on * and will cause the buffer cache to attempt to free the * underlying pages. */ if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { /* * If there are no dependencies, and it's VMIO, * then we don't need the buf, mark it available * for freeing. For non-direct VMIO reads, the VM * has the data. 
*/ bp->b_flags |= B_RELBUF; brelse(bp); } else { /* * Otherwise let whoever * made the request take care of * freeing it. We just queue * it onto another list. */ bqrelse(bp); } } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) { if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bqrelse(bp); } } return (error); } /* * Extended attribute area writing. */ static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int blkoffset, error, flags, size, xfersize; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extwrite: mode"); #endif if (ioflag & IO_APPEND) uio->uio_offset = dp->di_extsize; KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) return (EFBIG); resid = uio->uio_resid; osize = dp->di_extsize; flags = IO_EXT; if (ioflag & IO_SYNC) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ucred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if (uio->uio_offset + xfersize > dp->di_extsize) dp->di_extsize = uio->uio_offset + xfersize; size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; } /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || xfersize + blkoffset == fs->fs_bsize || (ioflag & (IO_ASYNC | IO_DIRECT))) bawrite(bp); else bdwrite(bp); if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
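The dispatch order spelled out in the write-strategy comments above (synchronous if IO_SYNC, asynchronous under memory pressure, clustered for a full block, async for IO_DIRECT, delayed otherwise) can be mirrored in a small decision function. A standalone sketch, illustrative only; the enum and parameter names are my own, not kernel API:

/*
 * Standalone sketch (illustrative only): the order of the buffer
 * write-dispatch decision described in the comments above.
 */
#include <stdio.h>
#include <stdbool.h>

enum disposition { WRITE_SYNC, WRITE_ASYNC, WRITE_CLUSTER, WRITE_DELAYED };

static enum disposition
choose_write(bool io_sync, bool mem_pressure, bool full_block, bool io_direct)
{
	if (io_sync)
		return (WRITE_SYNC);		/* bwrite() */
	if (mem_pressure)
		return (WRITE_ASYNC);		/* bawrite() */
	if (full_block)
		return (WRITE_CLUSTER);		/* cluster_write() if allowed */
	if (io_direct)
		return (WRITE_ASYNC);		/* bawrite() */
	return (WRITE_DELAYED);			/* bdwrite() */
}

int
main(void)
{
	/* A full block with no special flags gets the clustered path. */
	printf("%d\n", choose_write(false, false, true, false));
	return (0);
}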
*/ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); dp->di_mode = ip->i_mode; } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_EXT | (ioflag&IO_SYNC), ucred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Vnode operating to retrieve a named extended attribute. * * Locate a particular EA (nspace:name) in the area (ptr:length), and return * the length of the EA, and possibly the pointer to the entry and to the data. */ static int ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac) { u_char *p, *pe, *pn, *p0; int eapad1, eapad2, ealength, ealen, nlen; uint32_t ul; pe = ptr + length; nlen = strlen(name); for (p = ptr; p < pe; p = pn) { p0 = p; bcopy(p, &ul, sizeof(ul)); pn = p + ul; /* make sure this entry is complete */ if (pn > pe) break; p += sizeof(uint32_t); if (*p != nspace) continue; p++; eapad2 = *p++; if (*p != nlen) continue; p++; if (bcmp(p, name, nlen)) continue; ealength = sizeof(uint32_t) + 3 + nlen; eapad1 = 8 - (ealength % 8); if (eapad1 == 8) eapad1 = 0; ealength += eapad1; ealen = ul - ealength - eapad2; p += nlen + eapad1; if (eap != NULL) *eap = p0; if (eac != NULL) *eac = p; return (ealen); } return(-1); } static int ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct uio luio; struct iovec liovec; u_int easize; int error; u_char *eae; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; easize = dp->di_extsize; if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize) return (EFBIG); eae = malloc(easize + extra, M_TEMP, M_WAITOK); liovec.iov_base = eae; liovec.iov_len = easize; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = easize; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_READ; luio.uio_td = td; error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); if (error) { free(eae, M_TEMP); return(error); } *p = eae; return (0); } static void ffs_lock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); while (ip->i_flag & IN_EA_LOCKED) { ip->i_flag |= IN_EA_LOCKWAIT; msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", 0); } ip->i_flag |= IN_EA_LOCKED; VI_UNLOCK(vp); } static void ffs_unlock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); if (ip->i_flag & IN_EA_LOCKWAIT) wakeup(&ip->i_ea_refs); ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); VI_UNLOCK(vp); } static int ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) { struct inode *ip; struct ufs2_dinode *dp; int error; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area != NULL) { ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } dp = ip->i_din2; error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); if (error) { ffs_unlock_ea(vp); return (error); } ip->i_ea_len = dp->di_extsize; ip->i_ea_error = 0; ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } /* * Vnode extattr transaction commit/abort */ static int ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) { struct inode *ip; struct uio luio; struct iovec liovec; int error; struct ufs2_dinode *dp; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area == NULL) { ffs_unlock_ea(vp); return (EINVAL); } dp = ip->i_din2; error = ip->i_ea_error; if (commit && error == 0) { 
ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); if (cred == NOCRED) cred = vp->v_mount->mnt_cred; liovec.iov_base = ip->i_ea_area; liovec.iov_len = ip->i_ea_len; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = ip->i_ea_len; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_WRITE; luio.uio_td = td; /* XXX: I'm not happy about truncating to zero size */ if (ip->i_ea_len < dp->di_extsize) error = ffs_truncate(vp, 0, IO_EXT, cred); error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); } if (--ip->i_ea_refs == 0) { free(ip->i_ea_area, M_TEMP); ip->i_ea_area = NULL; ip->i_ea_len = 0; ip->i_ea_error = 0; } ffs_unlock_ea(vp); return (error); } /* * Vnode extattr strategy routine for fifos. * * We need to check for a read or write of the external attributes. * Otherwise we just fall through and do the usual thing. */ static int ffsext_strategy(struct vop_strategy_args *ap) /* struct vop_strategy_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct buf *a_bp; }; */ { struct vnode *vp; daddr_t lbn; vp = ap->a_vp; lbn = ap->a_bp->b_lblkno; if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -NXADDR) return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); if (vp->v_type == VFIFO) return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); panic("spec nodes went here"); } /* * Vnode extattr transaction commit/abort */ static int ffs_openextattr(struct vop_openextattr_args *ap) /* struct vop_openextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); } /* * Vnode extattr transaction commit/abort */ static int ffs_closeextattr(struct vop_closeextattr_args *ap) /* struct vop_closeextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_commit; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); } /* * Vnode operation to remove a named attribute. */ static int ffs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; uint32_t ealength, ul; int ealen, olen, eapad1, eapad2, error, i, easize; u_char *eae, *p; ip = VTOI(ap->a_vp); fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. 
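The extended attribute code above follows a small transaction pattern: ffs_open_ea() reads the whole EA area into ip->i_ea_area and bumps a reference count, callers mutate that in-memory copy, and ffs_close_ea() either writes it back (commit) or just drops the reference. A standalone userland sketch of the same reference-counted open/modify/commit shape, illustrative only:

/*
 * Standalone sketch (illustrative only): the open/modify/commit pattern
 * used around ip->i_ea_area.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ea_area {
	char	*buf;		/* in-memory copy of the EA area */
	size_t	 len;
	int	 refs;		/* concurrent openers share one copy */
};

static int
ea_open(struct ea_area *ea)
{
	if (ea->refs++ == 0) {
		ea->buf = calloc(1, 64);	/* stand-in for reading the area */
		ea->len = 64;
	}
	return (ea->buf == NULL ? -1 : 0);
}

static void
ea_close(struct ea_area *ea, int commit)
{
	if (commit)
		printf("write back %zu bytes\n", ea->len);	/* stand-in for write-out */
	if (--ea->refs == 0) {
		free(ea->buf);
		ea->buf = NULL;
		ea->len = 0;
	}
}

int
main(void)
{
	struct ea_area ea = { NULL, 0, 0 };

	if (ea_open(&ea) == 0) {
		memcpy(ea.buf, "value", 5);	/* mutate the shared copy */
		ea_close(&ea, 1);		/* commit */
	}
	return (0);
}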
*/ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); ealength = eapad1 = ealen = eapad2 = 0; eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &p, NULL); if (olen == -1) { /* delete but nonexistent */ free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(ENOATTR); } bcopy(p, &ul, sizeof ul); i = p - eae + ul; if (ul != ealength) { bcopy(p + ul, p + ealength, easize - i); easize += (ealength - ul); } if (easize > NXADDR * fs->fs_bsize) { free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return(ENOSPC); } p = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(p, M_TEMP); error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ffs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; u_char *eae, *p; unsigned easize; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); eae = ip->i_ea_area; easize = ip->i_ea_len; ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, NULL, &p); if (ealen >= 0) { error = 0; if (ap->a_size != NULL) *ap->a_size = ealen; else if (ap->a_uio != NULL) error = uiomove(p, ealen, ap->a_uio); } else error = ENOATTR; ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ffs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; u_char *eae, *p, *pe, *pn; unsigned easize; uint32_t ul; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); eae = ip->i_ea_area; easize = ip->i_ea_len; error = 0; if (ap->a_size != NULL) *ap->a_size = 0; pe = eae + easize; for(p = eae; error == 0 && p < pe; p = pn) { bcopy(p, &ul, sizeof(ul)); pn = p + ul; if (pn > pe) break; p += sizeof(ul); if (*p++ != ap->a_attrnamespace) continue; p++; /* pad2 */ ealen = *p; if (ap->a_size != NULL) { *ap->a_size += ealen + 1; } else if (ap->a_uio != NULL) { error = uiomove(p, ealen + 1, ap->a_uio); } } ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to set a named attribute. 
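The record layout that ffs_findextattr, ffs_listextattr and ffs_setextattr all walk is implicit in the pointer arithmetic above: a 32-bit total record length, a namespace byte, a content-padding byte, a name-length byte, the name padded so the header reaches a multiple of 8, then the content padded to 8. A standalone sketch of that header math, illustrative only, mirroring the constants used in the code above:

/*
 * Standalone sketch (illustrative only): size of one extended attribute
 * record as laid out by the code above.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t
ea_record_size(const char *name, uint32_t content_len)
{
	uint32_t ealength, eapad1, eapad2;

	/* 4-byte length + namespace byte + pad byte + name-length byte. */
	ealength = sizeof(uint32_t) + 3 + strlen(name);
	eapad1 = 8 - (ealength % 8);		/* pad header to 8 bytes */
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (content_len % 8);		/* pad content to 8 bytes */
	if (eapad2 == 8)
		eapad2 = 0;
	return (ealength + eapad1 + content_len + eapad2);
}

int
main(void)
{
	/* "user.test" with 5 bytes of content: 16-byte header + 8 = 24. */
	printf("%u\n", (unsigned)ea_record_size("user.test", 5));
	return (0);
}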
*/ static int ffs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; uint32_t ealength, ul; ssize_t ealen; int olen, eapad1, eapad2, error, i, easize; u_char *eae, *p; ip = VTOI(ap->a_vp); fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); /* XXX Now unsupported API to delete EAs using NULL uio. */ if (ap->a_uio == NULL) return (EOPNOTSUPP); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); ealen = ap->a_uio->uio_resid; if (ealen < 0 || ealen > lblktosize(fs, NXADDR)) return (EINVAL); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. */ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); eapad1 = 8 - (ealength % 8); if (eapad1 == 8) eapad1 = 0; eapad2 = 8 - (ealen % 8); if (eapad2 == 8) eapad2 = 0; ealength += eapad1 + ealen + eapad2; eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &p, NULL); if (olen == -1) { /* new, append at end */ p = eae + easize; easize += ealength; } else { bcopy(p, &ul, sizeof ul); i = p - eae + ul; if (ul != ealength) { bcopy(p + ul, p + ealength, easize - i); easize += (ealength - ul); } } if (easize > lblktosize(fs, NXADDR)) { free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return(ENOSPC); } bcopy(&ealength, p, sizeof(ealength)); p += sizeof(ealength); *p++ = ap->a_attrnamespace; *p++ = eapad2; *p++ = strlen(ap->a_name); strcpy(p, ap->a_name); p += strlen(ap->a_name); bzero(p, eapad1); p += eapad1; error = uiomove(p, ealen, ap->a_uio); if (error) { free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return(error); } p += ealen; bzero(p, eapad2); p = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(p, M_TEMP); error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return(error); } /* * Vnode pointer to File handle */ static int ffs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } SYSCTL_DECL(_vfs_ffs); static int use_buf_pager = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, "Always use buffer pager instead of bmap"); -static int buf_pager_relbuf; -SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, - &buf_pager_relbuf, 0, - "Make buffer pager release buffers after reading"); -/* - * The FFS pager. It uses buffer reads to validate pages. 
- * - * In contrast to the generic local pager from vm/vnode_pager.c, this - * pager correctly and easily handles volumes where the underlying - * device block size is greater than the machine page size. The - * buffer cache transparently extends the requested page run to be - * aligned at the block boundary, and does the necessary bogus page - * replacements in the addends to avoid obliterating already valid - * pages. - * - * The only non-trivial issue is that the exclusive busy state for - * pages, which is assumed by the vm_pager_getpages() interface, is - * incompatible with the VMIO buffer cache's desire to share-busy the - * pages. This function performs a trivial downgrade of the pages' - * state before reading buffers, and a less trivial upgrade from the - * shared-busy to excl-busy state after the read. - */ +static daddr_t +ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) +{ + + return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); +} + static int +ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) +{ + + return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); +} + +static int ffs_getpages(struct vop_getpages_args *ap) { struct vnode *vp; - vm_page_t *ma, m; - vm_object_t object; - struct buf *bp; struct ufsmount *um; - ufs_lbn_t lbn, lbnp; - vm_ooffset_t la, lb; - long bsize; - int bo_bs, count, error, i; - bool redo, lpart; vp = ap->a_vp; - ma = ap->a_m; - count = ap->a_count; + um = VFSTOUFS(vp->v_mount); - um = VFSTOUFS(ap->a_vp->v_mount); - bo_bs = um->um_devvp->v_bufobj.bo_bsize; - if (!use_buf_pager && bo_bs <= PAGE_SIZE) - return (vnode_pager_generic_getpages(vp, ma, count, + if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) + return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); - - object = vp->v_object; - la = IDX_TO_OFF(ma[count - 1]->pindex); - if (la >= object->un_pager.vnp.vnp_size) - return (VM_PAGER_BAD); - lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; - if (ap->a_rbehind != NULL) { - lb = IDX_TO_OFF(ma[0]->pindex); - *ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); - } - if (ap->a_rahead != NULL) { - *ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la); - if (la + IDX_TO_OFF(*ap->a_rahead) >= - object->un_pager.vnp.vnp_size) { - *ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager. - vnp.vnp_size, PAGE_SIZE) - la); - } - } - VM_OBJECT_WLOCK(object); -again: - for (i = 0; i < count; i++) - vm_page_busy_downgrade(ma[i]); - VM_OBJECT_WUNLOCK(object); - - lbnp = -1; - for (i = 0; i < count; i++) { - m = ma[i]; - - /* - * Pages are shared busy and the object lock is not - * owned, which together allow for the pages' - * invalidation. The racy test for validity avoids - * useless creation of the buffer for the most typical - * case when invalidation is not used in redo or for - * parallel read. The shared->excl upgrade loop at - * the end of the function catches the race in a - * reliable way (protected by the object lock). 
- */ - if (m->valid == VM_PAGE_BITS_ALL) - continue; - - lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex)); - if (lbn != lbnp) { - bsize = blksize(um->um_fs, VTOI(vp), lbn); - error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED, - &bp); - if (error != 0) - break; - KASSERT(1 /* racy, enable for debugging */ || - m->valid == VM_PAGE_BITS_ALL || i == count - 1, - ("buf %d %p invalid", i, m)); - if (i == count - 1 && lpart) { - VM_OBJECT_WLOCK(object); - if (m->valid != 0 && - m->valid != VM_PAGE_BITS_ALL) - vm_page_zero_invalid(m, TRUE); - VM_OBJECT_WUNLOCK(object); - } - if (LIST_EMPTY(&bp->b_dep)) { - /* - * Invalidation clears m->valid, but - * may leave B_CACHE flag if the - * buffer existed at the invalidation - * time. In this case, recycle the - * buffer to do real read on next - * bread() after redo. - * - * Otherwise B_RELBUF is not strictly - * necessary, enable to reduce buf - * cache pressure. - */ - if (buf_pager_relbuf || - m->valid != VM_PAGE_BITS_ALL) - bp->b_flags |= B_RELBUF; - - bp->b_flags &= ~B_NOCACHE; - brelse(bp); - } else { - bqrelse(bp); - } - lbnp = lbn; - } - } - - VM_OBJECT_WLOCK(object); - redo = false; - for (i = 0; i < count; i++) { - vm_page_sunbusy(ma[i]); - ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); - - /* - * Since the pages were only sbusy while neither the - * buffer nor the object lock was held by us, or - * reallocated while vm_page_grab() slept for busy - * relinguish, they could have been invalidated. - * Recheck the valid bits and re-read as needed. - * - * Note that the last page is made fully valid in the - * read loop, and partial validity for the page at - * index count - 1 could mean that the page was - * invalidated or removed, so we must restart for - * safety as well. - */ - if (ma[i]->valid != VM_PAGE_BITS_ALL) - redo = true; - } - if (redo && error == 0) - goto again; - VM_OBJECT_WUNLOCK(object); - return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); + return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); } Index: user/alc/PQ_LAUNDRY/sys/x86/cpufreq/hwpstate.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/x86/cpufreq/hwpstate.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/x86/cpufreq/hwpstate.c (revision 308054) @@ -1,508 +1,510 @@ /*- * Copyright (c) 2005 Nate Lawson * Copyright (c) 2004 Colin Percival * Copyright (c) 2004-2005 Bruno Durcot * Copyright (c) 2004 FUKUDA Nobuhiko * Copyright (c) 2009 Michael Reifenberger * Copyright (c) 2009 Norikatsu Shigemura * Copyright (c) 2008-2009 Gen Otsuji * * This code is depending on kern_cpu.c, est.c, powernow.c, p4tcc.c, smist.c * in various parts. The authors of these files are Nate Lawson, * Colin Percival, Bruno Durcot, and FUKUDA Nobuhiko. * This code contains patches by Michael Reifenberger and Norikatsu Shigemura. * Thank you. * * Redistribution and use in source and binary forms, with or without * modification, are permitted providing that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * For more info: * BIOS and Kernel Developer's Guide(BKDG) for AMD Family 10h Processors * 31116 Rev 3.20 February 04, 2009 * BIOS and Kernel Developer's Guide(BKDG) for AMD Family 11h Processors * 41256 Rev 3.00 - July 07, 2008 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi_if.h" #include "cpufreq_if.h" #define MSR_AMD_10H_11H_LIMIT 0xc0010061 #define MSR_AMD_10H_11H_CONTROL 0xc0010062 #define MSR_AMD_10H_11H_STATUS 0xc0010063 #define MSR_AMD_10H_11H_CONFIG 0xc0010064 #define AMD_10H_11H_MAX_STATES 16 /* for MSR_AMD_10H_11H_LIMIT C001_0061 */ #define AMD_10H_11H_GET_PSTATE_MAX_VAL(msr) (((msr) >> 4) & 0x7) #define AMD_10H_11H_GET_PSTATE_LIMIT(msr) (((msr)) & 0x7) /* for MSR_AMD_10H_11H_CONFIG 10h:C001_0064:68 / 11h:C001_0064:6B */ #define AMD_10H_11H_CUR_VID(msr) (((msr) >> 9) & 0x7F) #define AMD_10H_11H_CUR_DID(msr) (((msr) >> 6) & 0x07) #define AMD_10H_11H_CUR_FID(msr) ((msr) & 0x3F) #define HWPSTATE_DEBUG(dev, msg...) \ do{ \ if(hwpstate_verbose) \ device_printf(dev, msg); \ }while(0) struct hwpstate_setting { int freq; /* CPU clock in Mhz or 100ths of a percent. */ int volts; /* Voltage in mV. */ int power; /* Power consumed in mW. */ int lat; /* Transition latency in us. 
*/ int pstate_id; /* P-State id */ }; struct hwpstate_softc { device_t dev; struct hwpstate_setting hwpstate_settings[AMD_10H_11H_MAX_STATES]; int cfnum; }; static void hwpstate_identify(driver_t *driver, device_t parent); static int hwpstate_probe(device_t dev); static int hwpstate_attach(device_t dev); static int hwpstate_detach(device_t dev); static int hwpstate_set(device_t dev, const struct cf_setting *cf); static int hwpstate_get(device_t dev, struct cf_setting *cf); static int hwpstate_settings(device_t dev, struct cf_setting *sets, int *count); static int hwpstate_type(device_t dev, int *type); static int hwpstate_shutdown(device_t dev); static int hwpstate_features(driver_t *driver, u_int *features); static int hwpstate_get_info_from_acpi_perf(device_t dev, device_t perf_dev); static int hwpstate_get_info_from_msr(device_t dev); static int hwpstate_goto_pstate(device_t dev, int pstate_id); static int hwpstate_verbose = 0; SYSCTL_INT(_debug, OID_AUTO, hwpstate_verbose, CTLFLAG_RWTUN, &hwpstate_verbose, 0, "Debug hwpstate"); static device_method_t hwpstate_methods[] = { /* Device interface */ DEVMETHOD(device_identify, hwpstate_identify), DEVMETHOD(device_probe, hwpstate_probe), DEVMETHOD(device_attach, hwpstate_attach), DEVMETHOD(device_detach, hwpstate_detach), DEVMETHOD(device_shutdown, hwpstate_shutdown), /* cpufreq interface */ DEVMETHOD(cpufreq_drv_set, hwpstate_set), DEVMETHOD(cpufreq_drv_get, hwpstate_get), DEVMETHOD(cpufreq_drv_settings, hwpstate_settings), DEVMETHOD(cpufreq_drv_type, hwpstate_type), /* ACPI interface */ DEVMETHOD(acpi_get_features, hwpstate_features), {0, 0} }; static devclass_t hwpstate_devclass; static driver_t hwpstate_driver = { "hwpstate", hwpstate_methods, sizeof(struct hwpstate_softc), }; DRIVER_MODULE(hwpstate, cpu, hwpstate_driver, hwpstate_devclass, 0, 0); /* * Go to Px-state on all cpus considering the limit. */ static int hwpstate_goto_pstate(device_t dev, int pstate) { int i; uint64_t msr; int j; int limit; int id = pstate; int error; /* get the current pstate limit */ msr = rdmsr(MSR_AMD_10H_11H_LIMIT); limit = AMD_10H_11H_GET_PSTATE_LIMIT(msr); if(limit > id) id = limit; /* * We are going to the same Px-state on all cpus. * Probably should take _PSD into account. */ error = 0; CPU_FOREACH(i) { /* Bind to each cpu. */ thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); HWPSTATE_DEBUG(dev, "setting P%d-state on cpu%d\n", id, PCPU_GET(cpuid)); /* Go To Px-state */ wrmsr(MSR_AMD_10H_11H_CONTROL, id); } CPU_FOREACH(i) { /* Bind to each cpu. */ thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); /* wait loop (100*100 usec is enough ?) */ for(j = 0; j < 100; j++){ /* get the result. 
not assure msr=id */ msr = rdmsr(MSR_AMD_10H_11H_STATUS); if(msr == id){ break; } DELAY(100); } HWPSTATE_DEBUG(dev, "result P%d-state on cpu%d\n", (int)msr, PCPU_GET(cpuid)); if (msr != id) { HWPSTATE_DEBUG(dev, "error: loop is not enough.\n"); error = ENXIO; } } thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (error); } static int hwpstate_set(device_t dev, const struct cf_setting *cf) { struct hwpstate_softc *sc; struct hwpstate_setting *set; int i; if (cf == NULL) return (EINVAL); sc = device_get_softc(dev); set = sc->hwpstate_settings; for (i = 0; i < sc->cfnum; i++) if (CPUFREQ_CMP(cf->freq, set[i].freq)) break; if (i == sc->cfnum) return (EINVAL); return (hwpstate_goto_pstate(dev, set[i].pstate_id)); } static int hwpstate_get(device_t dev, struct cf_setting *cf) { struct hwpstate_softc *sc; struct hwpstate_setting set; uint64_t msr; sc = device_get_softc(dev); if (cf == NULL) return (EINVAL); msr = rdmsr(MSR_AMD_10H_11H_STATUS); if(msr >= sc->cfnum) return (EINVAL); set = sc->hwpstate_settings[msr]; cf->freq = set.freq; cf->volts = set.volts; cf->power = set.power; cf->lat = set.lat; cf->dev = dev; return (0); } static int hwpstate_settings(device_t dev, struct cf_setting *sets, int *count) { struct hwpstate_softc *sc; struct hwpstate_setting set; int i; if (sets == NULL || count == NULL) return (EINVAL); sc = device_get_softc(dev); if (*count < sc->cfnum) return (E2BIG); for (i = 0; i < sc->cfnum; i++, sets++) { set = sc->hwpstate_settings[i]; sets->freq = set.freq; sets->volts = set.volts; sets->power = set.power; sets->lat = set.lat; sets->dev = dev; } *count = sc->cfnum; return (0); } static int hwpstate_type(device_t dev, int *type) { if (type == NULL) return (EINVAL); *type = CPUFREQ_TYPE_ABSOLUTE; return (0); } static void hwpstate_identify(driver_t *driver, device_t parent) { if (device_find_child(parent, "hwpstate", -1) != NULL) return; if (cpu_vendor_id != CPU_VENDOR_AMD || CPUID_TO_FAMILY(cpu_id) < 0x10) return; /* * Check if hardware pstate enable bit is set. */ if ((amd_pminfo & AMDPM_HW_PSTATE) == 0) { HWPSTATE_DEBUG(parent, "hwpstate enable bit is not set.\n"); return; } if (resource_disabled("hwpstate", 0)) return; if (BUS_ADD_CHILD(parent, 10, "hwpstate", -1) == NULL) device_printf(parent, "hwpstate: add child failed\n"); } static int hwpstate_probe(device_t dev) { struct hwpstate_softc *sc; device_t perf_dev; uint64_t msr; int error, type; /* * Only hwpstate0. * It goes well with acpi_throttle. */ if (device_get_unit(dev) != 0) return (ENXIO); sc = device_get_softc(dev); sc->dev = dev; /* * Check if acpi_perf has INFO only flag. */ perf_dev = device_find_child(device_get_parent(dev), "acpi_perf", -1); error = TRUE; if (perf_dev && device_is_attached(perf_dev)) { error = CPUFREQ_DRV_TYPE(perf_dev, &type); if (error == 0) { if ((type & CPUFREQ_FLAG_INFO_ONLY) == 0) { /* * If acpi_perf doesn't have INFO_ONLY flag, * it will take care of pstate transitions. */ HWPSTATE_DEBUG(dev, "acpi_perf will take care of pstate transitions.\n"); return (ENXIO); } else { /* * If acpi_perf has INFO_ONLY flag, (_PCT has FFixedHW) * we can get _PSS info from acpi_perf * without going into ACPI. */ HWPSTATE_DEBUG(dev, "going to fetch info from acpi_perf\n"); error = hwpstate_get_info_from_acpi_perf(dev, perf_dev); } } } if (error == 0) { /* * Now we get _PSS info from acpi_perf without error. * Let's check it. 
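Both the clamp in hwpstate_goto_pstate above and the probe cross-check that follows come down to two 3-bit fields of the limit MSR. A standalone sketch, illustrative only; the sample MSR value is made up:

/*
 * Standalone sketch (illustrative only): the two 3-bit fields of the
 * P-state limit MSR and the clamp applied before a transition.
 */
#include <stdio.h>
#include <stdint.h>

#define GET_PSTATE_MAX_VAL(msr)	(((msr) >> 4) & 0x7)	/* highest valid P-state number */
#define GET_PSTATE_LIMIT(msr)	((msr) & 0x7)		/* lowest P-state number allowed now */

int
main(void)
{
	uint64_t msr = 0x42;			/* made-up sample value */
	int requested = 1;
	int limit = GET_PSTATE_LIMIT(msr);
	int id = requested < limit ? limit : requested;

	printf("max P%d, limit P%d, requested P%d -> going to P%d\n",
	    (int)GET_PSTATE_MAX_VAL(msr), limit, requested, id);
	return (0);
}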
*/ msr = rdmsr(MSR_AMD_10H_11H_LIMIT); if (sc->cfnum != 1 + AMD_10H_11H_GET_PSTATE_MAX_VAL(msr)) { HWPSTATE_DEBUG(dev, "msr and acpi _PSS count mismatch.\n"); error = TRUE; } } /* * If we cannot get info from acpi_perf, * Let's get info from MSRs. */ if (error) error = hwpstate_get_info_from_msr(dev); if (error) return (error); device_set_desc(dev, "Cool`n'Quiet 2.0"); return (0); } static int hwpstate_attach(device_t dev) { return (cpufreq_register(dev)); } static int hwpstate_get_info_from_msr(device_t dev) { struct hwpstate_softc *sc; struct hwpstate_setting *hwpstate_set; uint64_t msr; int family, i, fid, did; family = CPUID_TO_FAMILY(cpu_id); sc = device_get_softc(dev); /* Get pstate count */ msr = rdmsr(MSR_AMD_10H_11H_LIMIT); sc->cfnum = 1 + AMD_10H_11H_GET_PSTATE_MAX_VAL(msr); hwpstate_set = sc->hwpstate_settings; for (i = 0; i < sc->cfnum; i++) { msr = rdmsr(MSR_AMD_10H_11H_CONFIG + i); - if ((msr & ((uint64_t)1 << 63)) != ((uint64_t)1 << 63)) { + if ((msr & ((uint64_t)1 << 63)) == 0) { HWPSTATE_DEBUG(dev, "msr is not valid.\n"); return (ENXIO); } did = AMD_10H_11H_CUR_DID(msr); fid = AMD_10H_11H_CUR_FID(msr); + + /* Convert fid/did to frequency. */ switch(family) { case 0x11: - /* fid/did to frequency */ - hwpstate_set[i].freq = 100 * (fid + 0x08) / (1 << did); + hwpstate_set[i].freq = (100 * (fid + 0x08)) >> did; break; case 0x10: - /* fid/did to frequency */ - hwpstate_set[i].freq = 100 * (fid + 0x10) / (1 << did); + case 0x12: + case 0x15: + case 0x16: + hwpstate_set[i].freq = (100 * (fid + 0x10)) >> did; break; default: - HWPSTATE_DEBUG(dev, "get_info_from_msr: AMD family %d CPU's are not implemented yet. sorry.\n", family); + HWPSTATE_DEBUG(dev, "get_info_from_msr: AMD family 0x%02x CPU's are not implemented yet. sorry.\n", family); return (ENXIO); - break; } hwpstate_set[i].pstate_id = i; /* There was volts calculation, but deleted it. */ hwpstate_set[i].volts = CPUFREQ_VAL_UNKNOWN; hwpstate_set[i].power = CPUFREQ_VAL_UNKNOWN; hwpstate_set[i].lat = CPUFREQ_VAL_UNKNOWN; } return (0); } static int hwpstate_get_info_from_acpi_perf(device_t dev, device_t perf_dev) { struct hwpstate_softc *sc; struct cf_setting *perf_set; struct hwpstate_setting *hwpstate_set; int count, error, i; perf_set = malloc(MAX_SETTINGS * sizeof(*perf_set), M_TEMP, M_NOWAIT); if (perf_set == NULL) { HWPSTATE_DEBUG(dev, "nomem\n"); return (ENOMEM); } /* * Fetch settings from acpi_perf. * Now it is attached, and has info only flag. 
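The frequency formula changed in the hunk above is simply CoreCOF = 100 MHz * (CpuFid + offset) >> CpuDid, with an offset of 0x10 for families 10h/12h/15h/16h and 0x08 for family 11h. A standalone worked example, illustrative only; the fid/did values are made up:

/*
 * Standalone sketch (illustrative only): the fid/did to MHz conversion
 * used above; the sample register values are made up.
 */
#include <stdio.h>

static int
fid_did_to_mhz(int family, int fid, int did)
{
	int offset = (family == 0x11) ? 0x08 : 0x10;	/* per-family offset */

	return ((100 * (fid + offset)) >> did);	/* >> did divides by 2^did */
}

int
main(void)
{
	/* Family 10h, fid 0x10, did 0: 100 * (16 + 16) = 3200 MHz. */
	printf("%d MHz\n", fid_did_to_mhz(0x10, 0x10, 0));
	/* The same fid with did 1 halves the clock: 1600 MHz. */
	printf("%d MHz\n", fid_did_to_mhz(0x10, 0x10, 1));
	return (0);
}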
*/ count = MAX_SETTINGS; error = CPUFREQ_DRV_SETTINGS(perf_dev, perf_set, &count); if (error) { HWPSTATE_DEBUG(dev, "error: CPUFREQ_DRV_SETTINGS.\n"); goto out; } sc = device_get_softc(dev); sc->cfnum = count; hwpstate_set = sc->hwpstate_settings; for (i = 0; i < count; i++) { if (i == perf_set[i].spec[0]) { hwpstate_set[i].pstate_id = i; hwpstate_set[i].freq = perf_set[i].freq; hwpstate_set[i].volts = perf_set[i].volts; hwpstate_set[i].power = perf_set[i].power; hwpstate_set[i].lat = perf_set[i].lat; } else { HWPSTATE_DEBUG(dev, "ACPI _PSS object mismatch.\n"); error = ENXIO; goto out; } } out: if (perf_set) free(perf_set, M_TEMP); return (error); } static int hwpstate_detach(device_t dev) { hwpstate_goto_pstate(dev, 0); return (cpufreq_unregister(dev)); } static int hwpstate_shutdown(device_t dev) { /* hwpstate_goto_pstate(dev, 0); */ return (0); } static int hwpstate_features(driver_t *driver, u_int *features) { /* Notify the ACPI CPU that we support direct access to MSRs */ *features = ACPI_CAP_PERF_MSRS; return (0); } Index: user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h (revision 308054) @@ -1,120 +1,134 @@ /*- * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _X86_X86_VAR_H_ #define _X86_X86_VAR_H_ /* * Miscellaneous machine-dependent declarations. 
*/ extern long Maxmem; extern u_int basemem; extern int busdma_swi_pending; extern u_int cpu_exthigh; extern u_int cpu_feature; extern u_int cpu_feature2; extern u_int amd_feature; extern u_int amd_feature2; extern u_int amd_pminfo; extern u_int via_feature_rng; extern u_int via_feature_xcrypt; extern u_int cpu_clflush_line_size; extern u_int cpu_stdext_feature; extern u_int cpu_stdext_feature2; extern u_int cpu_fxsr; extern u_int cpu_high; extern u_int cpu_id; extern u_int cpu_max_ext_state_size; extern u_int cpu_mxcsr_mask; extern u_int cpu_procinfo; extern u_int cpu_procinfo2; extern char cpu_vendor[]; extern u_int cpu_vendor_id; extern u_int cpu_mon_mwait_flags; extern u_int cpu_mon_min_size; extern u_int cpu_mon_max_size; extern u_int cpu_maxphyaddr; extern char ctx_switch_xsave[]; extern u_int hv_high; extern char hv_vendor[]; extern char kstack[]; extern char sigcode[]; extern int szsigcode; extern int vm_page_dump_size; extern int workaround_erratum383; extern int _udatasel; extern int _ucodesel; extern int _ucode32sel; extern int _ufssel; extern int _ugssel; extern int use_xsave; extern uint64_t xsave_mask; struct pcb; struct thread; struct reg; struct fpreg; struct dbreg; struct dumperinfo; struct trapframe; /* * The interface type of the interrupt handler entry point cannot be * expressed in C. Use simplest non-variadic function type as an * approximation. */ typedef void alias_for_inthand_t(void); +/* + * Returns the maximum physical address that can be used with the + * current system. + */ +static __inline vm_paddr_t +cpu_getmaxphyaddr(void) +{ +#if defined(__i386__) && !defined(PAE) + return (0xffffffff); +#else + return ((1ULL << cpu_maxphyaddr) - 1); +#endif +} + void *alloc_fpusave(int flags); void busdma_swi(void); bool cpu_mwait_usable(void); void cpu_probe_amdc1e(void); void cpu_setregs(void); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void identify_cpu(void); void initializecpu(void); void initializecpucache(void); bool fix_cpuid(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); int is_physical_memory(vm_paddr_t addr); int isa_nmi(int cd); void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame); void nmi_call_kdb_smp(u_int type, struct trapframe *frame); void nmi_handle_intr(u_int type, struct trapframe *frame); void pagecopy(void *from, void *to); void printcpuinfo(void); int user_dbreg_trap(void); int minidumpsys(struct dumperinfo *); struct pcb *get_pcb_td(struct thread *td); #endif Index: user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c (revision 308053) +++ user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c (revision 308054) @@ -1,577 +1,577 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" #ifdef __i386__ #include "opt_npx.h" #include "opt_apic.h" #include "opt_xbox.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } void acpi_cpu_c1(void) { __asm __volatile("sti; hlt"); } void acpi_cpu_idle_mwait(uint32_t mwait_hint) { int *state; /* * XXXKIB. Software coordination mode should be supported, * but all Intel CPUs provide hardware coordination. */ state = (int *)PCPU_PTR(monitorbuf); KASSERT(*state == STATE_SLEEPING, ("cpu_mwait_cx: wrong monitorbuf state")); *state = STATE_MWAIT; cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) cpu_mwait(MWAIT_INTRBREAK, mwait_hint); /* * We should exit on any event that interrupts mwait, because * that event might be a wanted interrupt. */ *state = STATE_RUNNING; } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef __i386__ if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #endif /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. 
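The calibration that follows measures the TSC across a 1 ms delay and, when APERF/MPERF are available, scales it as rate = (tsc2 - tsc1) * (1000 * acnt / mcnt). A standalone arithmetic sketch, illustrative only; all counter deltas are made up:

/*
 * Standalone sketch (illustrative only): the APERF/MPERF scaling used by
 * the calibration below; all values are made up.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t tsc_delta = 3000000;	/* TSC ticks seen over the 1 ms DELAY */
	uint64_t acnt = 800000;		/* APERF delta: cycles at the actual clock */
	uint64_t mcnt = 1000000;	/* MPERF delta: cycles at the reference clock */

	/*
	 * The factor of 1000 undoes the 1 ms sample window; acnt/mcnt scales
	 * the invariant TSC rate down to the effective core clock.
	 */
	uint64_t perf = 1000 * acnt / mcnt;
	uint64_t rate = tsc_delta * perf;

	printf("estimated rate: %ju Hz (~%ju MHz)\n",
	    (uintmax_t)rate, (uintmax_t)(rate / 1000000));
	return (0);
}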
*/ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } bool cpu_mwait_usable(void) { return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); #ifndef PC98 static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else acpi_cpu_c1(); *state = STATE_RUNNING; } #endif /* !PC98 */ static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) enable_intr(); else acpi_cpu_c1(); *state = STATE_RUNNING; } static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_MWAIT; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { enable_intr(); *state = STATE_RUNNING; return; } cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); *state = STATE_RUNNING; } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_RUNNING; /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). 
* * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) void cpu_probe_amdc1e(void) { /* * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } #if defined(__i386__) && defined(PC98) void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; #else void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; #endif void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } /* * Ordered by speed/power consumption. */ struct { void *id_fn; char *id_name; } idle_tbl[] = { { cpu_idle_spin, "spin" }, { cpu_idle_mwait, "mwait" }, { cpu_idle_hlt, "hlt" }, #if !defined(__i386__) || !defined(PC98) { cpu_idle_acpi, "acpi" }, #endif { NULL, NULL } }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; #if !defined(__i386__) || !defined(PC98) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif p += sprintf(p, "%s%s", p != avail ? 
", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static int idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; char *p; int i; p = "unknown"; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; #if !defined(__i386__) || !defined(PC98) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif if (strcmp(idle_tbl[i].id_name, buf)) continue; cpu_idle_fn = idle_tbl[i].id_fn; return (0); } return (EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, idle_sysctl, "A", "currently selected idle function"); static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, &panic_on_nmi, 0, "Panic on NMI"); int nmi_is_broadcast = 1; SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, &nmi_is_broadcast, 0, "Chipset NMI is broadcast"); #ifdef KDB int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN, &kdb_on_nmi, 0, "Go to KDB on NMI"); #endif #ifdef DEV_ISA void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) { /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton for debugging. */ if (kdb_on_nmi) { - printf ("NMI/cpu%d ... going to debugger\n", cpu); + printf("NMI/cpu%d ... going to debugger\n", cpu); kdb_trap(type, 0, frame); } #endif /* KDB */ } else if (panic_on_nmi) { panic("NMI indicates hardware failure"); } } #endif void nmi_handle_intr(u_int type, struct trapframe *frame) { #ifdef DEV_ISA #ifdef SMP if (nmi_is_broadcast) { nmi_call_kdb_smp(type, frame); return; } #endif - nmi_call_kdb(0, type, frame); + nmi_call_kdb(PCPU_GET(cpuid), type, frame); #endif } Index: user/alc/PQ_LAUNDRY/usr.sbin/watchdogd/watchdogd.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.sbin/watchdogd/watchdogd.c (revision 308053) +++ user/alc/PQ_LAUNDRY/usr.sbin/watchdogd/watchdogd.c (revision 308054) @@ -1,792 +1,795 @@ /*- * Copyright (c) 2003-2004 Sean M. Kelly * Copyright (c) 2013 iXsystems.com, * author: Alfred Perlstein * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Software watchdog daemon. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static long fetchtimeout(int opt, const char *longopt, const char *myoptarg, int zero_ok); static void parseargs(int, char *[]); static int seconds_to_pow2ns(int); static void sighandler(int); static void watchdog_loop(void); static int watchdog_init(void); static int watchdog_onoff(int onoff); static int watchdog_patpat(u_int timeout); static void usage(void); static int tstotv(struct timeval *tv, struct timespec *ts); static int tvtohz(struct timeval *tv); static int debugging = 0; static int end_program = 0; static const char *pidfile = _PATH_VARRUN "watchdogd.pid"; static u_int timeout = WD_TO_128SEC; static u_int exit_timeout = WD_TO_NEVER; static u_int pretimeout = 0; static u_int timeout_sec; -static u_int passive = 0; +static u_int nap = 10; +static int passive = 0; static int is_daemon = 0; static int is_dry_run = 0; /* do not arm the watchdog, only report on timing of the watch program */ static int do_timedog = 0; static int do_syslog = 1; static int fd = -1; -static int nap = 10; static int carp_thresh_seconds = -1; static char *test_cmd = NULL; static const char *getopt_shortopts; static int pretimeout_set; static int pretimeout_act; static int pretimeout_act_set; static int softtimeout_set; static int softtimeout_act; static int softtimeout_act_set; static struct option longopts[] = { { "debug", no_argument, &debugging, 1 }, { "pretimeout", required_argument, &pretimeout_set, 1 }, { "pretimeout-action", required_argument, &pretimeout_act_set, 1 }, { "softtimeout", no_argument, &softtimeout_set, 1 }, { "softtimeout-action", required_argument, &softtimeout_act_set, 1 }, { NULL, 0, NULL, 0} }; /* * Ask malloc() to map minimum-sized chunks of virtual address space at a time, * so that mlockall() won't needlessly wire megabytes of unused memory into the * process. This must be done using the malloc_conf string so that it gets set * up before the first allocation, which happens before entry to main(). */ const char * malloc_conf = "lg_chunk:0"; /* * Periodically pat the watchdog, preventing it from firing. 
*/ int main(int argc, char *argv[]) { struct rtprio rtp; struct pidfh *pfh; pid_t otherpid; if (getuid() != 0) errx(EX_SOFTWARE, "not super user"); parseargs(argc, argv); if (do_syslog) openlog("watchdogd", LOG_CONS|LOG_NDELAY|LOG_PERROR, LOG_DAEMON); rtp.type = RTP_PRIO_REALTIME; rtp.prio = 0; if (rtprio(RTP_SET, 0, &rtp) == -1) err(EX_OSERR, "rtprio"); if (!is_dry_run && watchdog_init() == -1) errx(EX_SOFTWARE, "unable to initialize watchdog"); if (is_daemon) { if (watchdog_onoff(1) == -1) err(EX_OSERR, "patting the dog"); pfh = pidfile_open(pidfile, 0600, &otherpid); if (pfh == NULL) { if (errno == EEXIST) { watchdog_onoff(0); errx(EX_SOFTWARE, "%s already running, pid: %d", getprogname(), otherpid); } warn("Cannot open or create pidfile"); } if (debugging == 0 && daemon(0, 0) == -1) { watchdog_onoff(0); pidfile_remove(pfh); err(EX_OSERR, "daemon"); } signal(SIGHUP, SIG_IGN); signal(SIGINT, sighandler); signal(SIGTERM, sighandler); pidfile_write(pfh); if (madvise(0, 0, MADV_PROTECT) != 0) warn("madvise failed"); if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) warn("mlockall failed"); watchdog_loop(); /* exiting */ pidfile_remove(pfh); return (EX_OK); } else { if (passive) timeout |= WD_PASSIVE; else timeout |= WD_ACTIVE; if (watchdog_patpat(timeout) < 0) err(EX_OSERR, "patting the dog"); return (EX_OK); } } static void pow2ns_to_ts(int pow2ns, struct timespec *ts) { uint64_t ns; ns = 1ULL << pow2ns; ts->tv_sec = ns / 1000000000ULL; ts->tv_nsec = ns % 1000000000ULL; } /* * Convert a timeout in seconds to N where 2^N nanoseconds is close to * "seconds". * * The kernel expects the timeouts for watchdogs in "2^N nanosecond format". */ static u_int parse_timeout_to_pow2ns(char opt, const char *longopt, const char *myoptarg) { double a; u_int rv; struct timespec ts; struct timeval tv; int ticks; char shortopt[] = "- "; if (!longopt) shortopt[1] = opt; a = fetchtimeout(opt, longopt, myoptarg, 1); if (a == 0) rv = WD_TO_NEVER; else rv = seconds_to_pow2ns(a); pow2ns_to_ts(rv, &ts); tstotv(&tv, &ts); ticks = tvtohz(&tv); if (debugging) { printf("Timeout for %s%s " "is 2^%d nanoseconds " "(in: %s sec -> out: %jd sec %ld ns -> %d ticks)\n", longopt ? "-" : "", longopt ? longopt : shortopt, rv, myoptarg, (intmax_t)ts.tv_sec, ts.tv_nsec, ticks); } if (ticks <= 0) { errx(1, "Timeout for %s%s is too small, please choose a higher timeout.", longopt ? "-" : "", longopt ? longopt : shortopt); } return (rv); } /* * Catch signals and begin shutdown process. */ static void sighandler(int signum) { if (signum == SIGINT || signum == SIGTERM) end_program = 1; } /* * Open the watchdog device. */ static int watchdog_init(void) { if (is_dry_run) return 0; fd = open("/dev/" _PATH_WATCHDOG, O_RDWR); if (fd >= 0) return (0); warn("Could not open watchdog device"); return (-1); } /* * If we are doing timing, then get the time. 
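As the comment on parse_timeout_to_pow2ns above says, the kernel takes watchdog timeouts as an exponent N meaning 2^N nanoseconds. A standalone sketch of converting seconds to that form, illustrative only: the round-up choice is an assumption, since seconds_to_pow2ns() itself is outside this hunk, but rounding up is the safe direction for a watchdog.

/*
 * Standalone sketch (illustrative only): seconds -> 2^N nanoseconds.
 */
#include <stdio.h>
#include <stdint.h>

static int
sketch_seconds_to_pow2ns(int seconds)
{
	uint64_t ns = (uint64_t)seconds * 1000000000ULL;
	int n = 0;

	while ((1ULL << n) < ns)	/* smallest N with 2^N ns >= the request */
		n++;
	return (n);
}

int
main(void)
{
	int n = sketch_seconds_to_pow2ns(128);

	/* 128 s rounds up to 2^37 ns (~137.4 s). */
	printf("128 s -> 2^%d ns = %.1f s\n", n, (1ULL << n) / 1e9);
	return (0);
}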
/*
 * If we are doing timing, then get the time.
 */
static int
watchdog_getuptime(struct timespec *tp)
{
    int error;

    if (!do_timedog)
        return 0;

    error = clock_gettime(CLOCK_UPTIME_FAST, tp);
    if (error)
        warn("clock_gettime");
    return (error);
}

static long
watchdog_check_dogfunction_time(struct timespec *tp_start,
    struct timespec *tp_end)
{
    struct timeval tv_start, tv_end, tv_now, tv;
    const char *cmd_prefix, *cmd;
    struct timespec tp_now;
    int sec;

    if (!do_timedog)
        return (0);

    TIMESPEC_TO_TIMEVAL(&tv_start, tp_start);
    TIMESPEC_TO_TIMEVAL(&tv_end, tp_end);
    timersub(&tv_end, &tv_start, &tv);
    sec = tv.tv_sec;
    if (sec < carp_thresh_seconds)
        return (sec);

    if (test_cmd) {
        cmd_prefix = "Watchdog program";
        cmd = test_cmd;
    } else {
        cmd_prefix = "Watchdog operation";
        cmd = "stat(\"/etc\", &sb)";
    }
    if (do_syslog)
        syslog(LOG_CRIT, "%s: '%s' took too long: "
            "%d.%06ld seconds >= %d seconds threshold",
            cmd_prefix, cmd, sec, (long)tv.tv_usec,
            carp_thresh_seconds);
    else
        warnx("%s: '%s' took too long: "
            "%d.%06ld seconds >= %d seconds threshold",
            cmd_prefix, cmd, sec, (long)tv.tv_usec,
            carp_thresh_seconds);

    /*
     * Adjust the sleep interval again in case syslog(3) took a non-trivial
     * amount of time to run.
     */
    if (watchdog_getuptime(&tp_now))
        return (sec);
    TIMESPEC_TO_TIMEVAL(&tv_now, &tp_now);
    timersub(&tv_now, &tv_start, &tv);
    sec = tv.tv_sec;

    return (sec);
}

/*
 * Main program loop which is iterated every second.
 */
static void
watchdog_loop(void)
{
    struct timespec ts_start, ts_end;
    struct stat sb;
    long waited;
    int error, failed;

    while (end_program != 2) {
        failed = 0;

        error = watchdog_getuptime(&ts_start);
        if (error) {
            end_program = 1;
            goto try_end;
        }

        if (test_cmd != NULL)
            failed = system(test_cmd);
        else
            failed = stat("/etc", &sb);

        error = watchdog_getuptime(&ts_end);
        if (error) {
            end_program = 1;
            goto try_end;
        }

        if (failed == 0)
            watchdog_patpat(timeout|WD_ACTIVE);

        waited = watchdog_check_dogfunction_time(&ts_start, &ts_end);
        if (nap - waited > 0)
            sleep(nap - waited);

try_end:
        if (end_program != 0) {
            if (watchdog_onoff(0) == 0) {
                end_program = 2;
            } else {
                warnx("Could not stop the watchdog, not exiting");
                end_program = 0;
            }
        }
    }
}

/*
 * Reset the watchdog timer. This function must be called periodically
 * to keep the watchdog from firing.
 */
static int
watchdog_patpat(u_int t)
{

    if (is_dry_run)
        return 0;

    return ioctl(fd, WDIOCPATPAT, &t);
}

/*
 * Toggle the kernel's watchdog. This routine is used to enable and
 * disable the watchdog.
 */
static int
watchdog_onoff(int onoff)
{
    int error;

    /* fake successful watchdog op if a dry run */
    if (is_dry_run)
        return 0;

    if (onoff) {
        /*
         * Call the WDIOC_SETSOFT regardless of softtimeout_set
         * because we'll need to turn it off if someone had turned
         * it on.
         */
        error = ioctl(fd, WDIOC_SETSOFT, &softtimeout_set);
        if (error) {
            warn("setting WDIOC_SETSOFT %d", softtimeout_set);
            return (error);
        }
        error = watchdog_patpat((timeout|WD_ACTIVE));
        if (error) {
            warn("watchdog_patpat failed");
            goto failsafe;
        }
        if (softtimeout_act_set) {
            error = ioctl(fd, WDIOC_SETSOFTTIMEOUTACT,
                &softtimeout_act);
            if (error) {
                warn("setting WDIOC_SETSOFTTIMEOUTACT %d",
                    softtimeout_act);
                goto failsafe;
            }
        }
        if (pretimeout_set) {
            error = ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout);
            if (error) {
                warn("setting WDIOC_SETPRETIMEOUT %d",
                    pretimeout);
                goto failsafe;
            }
        }
        if (pretimeout_act_set) {
            error = ioctl(fd, WDIOC_SETPRETIMEOUTACT,
                &pretimeout_act);
            if (error) {
                warn("setting WDIOC_SETPRETIMEOUTACT %d",
                    pretimeout_act);
                goto failsafe;
            }
        }
        /* pat one more time for good measure */
        return watchdog_patpat((timeout|WD_ACTIVE));
    } else {
        return watchdog_patpat(exit_timeout);
    }

failsafe:
    watchdog_patpat(exit_timeout);
    return (error);
}

/*
 * Tell user how to use the program.
 */
static void
usage(void)
{
    if (is_daemon)
        fprintf(stderr, "usage:\n"
            "  watchdogd [-dnSw] [-e cmd] [-I pidfile] [-s sleep] [-t timeout]\n"
            "            [-T script_timeout] [-x exit_timeout]\n"
            "            [--debug]\n"
            "            [--pretimeout seconds] [-pretimeout-action action]\n"
            "            [--softtimeout] [-softtimeout-action action]\n"
            );
    else
        fprintf(stderr, "usage: watchdog [-d] [-t timeout]\n");
    exit(EX_USAGE);
}

static long
fetchtimeout(int opt, const char *longopt, const char *myoptarg, int zero_ok)
{
    const char *errstr;
    char *p;
    long rv;

    errstr = NULL;
    p = NULL;
    errno = 0;
    rv = strtol(myoptarg, &p, 0);
    if ((p != NULL && *p != '\0') || errno != 0)
        errstr = "is not a number";
    if (rv < 0 || (!zero_ok && rv == 0))
        errstr = "must be greater than zero";
    if (errstr) {
        if (longopt)
            errx(EX_USAGE, "--%s argument %s", longopt, errstr);
        else
            errx(EX_USAGE, "-%c argument %s", opt, errstr);
    }
    return (rv);
}

struct act_tbl {
    const char *at_act;
    int at_value;
};

static const struct act_tbl act_tbl[] = {
    { "panic", WD_SOFT_PANIC },
    { "ddb", WD_SOFT_DDB },
    { "log", WD_SOFT_LOG },
    { "printf", WD_SOFT_PRINTF },
    { NULL, 0 }
};

static void
timeout_act_error(const char *lopt, const char *badact)
{
    char *opts, *oldopts;
    int i;

    opts = NULL;
    for (i = 0; act_tbl[i].at_act != NULL; i++) {
        oldopts = opts;
        if (asprintf(&opts, "%s%s%s",
            oldopts == NULL ? "" : oldopts,
            oldopts == NULL ? "" : ", ",
            act_tbl[i].at_act) == -1)
            err(EX_OSERR, "malloc");
        free(oldopts);
    }
    warnx("bad --%s argument '%s' must be one of (%s).",
        lopt, badact, opts);
    usage();
}

/*
 * Take a comma separated list of actions and or the flags
 * together for the ioctl.
 */
static int
timeout_act_str2int(const char *lopt, const char *acts)
{
    int i;
    char *dupacts, *tofree;
    char *o;
    int rv = 0;

    tofree = dupacts = strdup(acts);
    if (!tofree)
        err(EX_OSERR, "malloc");
    while ((o = strsep(&dupacts, ",")) != NULL) {
        for (i = 0; act_tbl[i].at_act != NULL; i++) {
            if (!strcmp(o, act_tbl[i].at_act)) {
                rv |= act_tbl[i].at_value;
                break;
            }
        }
        if (act_tbl[i].at_act == NULL)
            timeout_act_error(lopt, o);
    }
    free(tofree);
    return rv;
}

int
tstotv(struct timeval *tv, struct timespec *ts)
{

    tv->tv_sec = ts->tv_sec;
    tv->tv_usec = ts->tv_nsec / 1000;
    return 0;
}
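/*
 * A standalone sketch, not part of this file, of the strsep()-based parsing
 * done by timeout_act_str2int() above.  The table and flag values here are
 * invented for the demo; the real code uses act_tbl and the WD_SOFT_* flags
 * from <sys/watchdog.h>.
 */
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const struct {
    const char *name;
    int value;
} demo_tbl[] = {
    { "panic",  0x01 },
    { "ddb",    0x02 },
    { "log",    0x04 },
    { "printf", 0x08 },
    { NULL, 0 }
};

int
main(void)
{
    char *list, *tofree, *act;
    int i, flags = 0;

    /* e.g. --softtimeout-action log,printf */
    tofree = list = strdup("log,printf");
    if (tofree == NULL)
        err(1, "strdup");
    while ((act = strsep(&list, ",")) != NULL) {
        for (i = 0; demo_tbl[i].name != NULL; i++)
            if (strcmp(act, demo_tbl[i].name) == 0) {
                flags |= demo_tbl[i].value;
                break;
            }
        if (demo_tbl[i].name == NULL)
            errx(1, "unknown action '%s'", act);
    }
    free(tofree);
    printf("flags = 0x%x\n", flags);    /* prints: flags = 0xc */
    return (0);
}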
/*
 * Convert a timeval to a number of ticks.
 * Mostly copied from the kernel.
 */
int
tvtohz(struct timeval *tv)
{
    register unsigned long ticks;
    register long sec, usec;
    int hz;
    size_t hzsize;
    int error;
    int tick;

    hzsize = sizeof(hz);

    error = sysctlbyname("kern.hz", &hz, &hzsize, NULL, 0);
    if (error)
        err(1, "sysctlbyname kern.hz");

    tick = 1000000 / hz;

    /*
     * If the number of usecs in the whole seconds part of the time
     * difference fits in a long, then the total number of usecs will
     * fit in an unsigned long.  Compute the total and convert it to
     * ticks, rounding up and adding 1 to allow for the current tick
     * to expire.  Rounding also depends on unsigned long arithmetic
     * to avoid overflow.
     *
     * Otherwise, if the number of ticks in the whole seconds part of
     * the time difference fits in a long, then convert the parts to
     * ticks separately and add, using similar rounding methods and
     * overflow avoidance.  This method would work in the previous
     * case but it is slightly slower and assumes that hz is integral.
     *
     * Otherwise, round the time difference down to the maximum
     * representable value.
     *
     * If ints have 32 bits, then the maximum value for any timeout in
     * 10ms ticks is 248 days.
     */
    sec = tv->tv_sec;
    usec = tv->tv_usec;

    if (usec < 0) {
        sec--;
        usec += 1000000;
    }

    if (sec < 0) {
#ifdef DIAGNOSTIC
        if (usec > 0) {
            sec++;
            usec -= 1000000;
        }
        printf("tvotohz: negative time difference %ld sec %ld usec\n",
            sec, usec);
#endif
        ticks = 1;
    } else if (sec <= LONG_MAX / 1000000)
        ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) /
            tick + 1;
    else if (sec <= LONG_MAX / hz)
        ticks = sec * hz +
            ((unsigned long)usec + (tick - 1)) / tick + 1;
    else
        ticks = LONG_MAX;
    if (ticks > INT_MAX)
        ticks = INT_MAX;
    return ((int)ticks);
}

static int
seconds_to_pow2ns(int seconds)
{
    uint64_t power;
    uint64_t ns;
    uint64_t shifted;

    if (seconds <= 0)
        errx(1, "seconds %d < 0", seconds);
    ns = ((uint64_t)seconds) * 1000000000ULL;
    power = flsll(ns);
    shifted = 1ULL << power;
    if (shifted <= ns) {
        power++;
    }
    if (debugging) {
        printf("shifted %lld\n", (long long)shifted);
        printf("seconds_to_pow2ns: seconds: %d, ns %lld, power %d\n",
            seconds, (long long)ns, (int)power);
    }
    return (power);
}
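/*
 * A standalone sketch, not part of this file, of the tick conversion that
 * tvtohz() above performs, for one concrete case and with hz fixed at an
 * assumed value of 1000 instead of read from the kern.hz sysctl: a 2^37 ns
 * timeout, roughly 137.438953 s, becomes 137440 ticks.
 */
#include <stdio.h>

int
main(void)
{
    long sec = 137, usec = 438953;  /* 2^37 ns expressed as a timeval */
    int hz = 1000;                  /* assumed kern.hz value */
    int tick = 1000000 / hz;        /* microseconds per tick */
    unsigned long ticks;

    /* same rounding as tvtohz(): round up, plus one tick of slack */
    ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1;
    printf("%ld.%06ld s -> %lu ticks at hz=%d\n", sec, usec, ticks, hz);
    /* prints: 137.438953 s -> 137440 ticks at hz=1000 */
    return (0);
}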
/*
 * Handle the few command line arguments supported.
 */
static void
parseargs(int argc, char *argv[])
{
    int longindex;
    int c;
    const char *lopt;

    /*
     * if we end with a 'd' aka 'watchdogd' then we are the daemon program,
     * otherwise run as a command line utility.
     */
    c = strlen(argv[0]);
    if (argv[0][c - 1] == 'd')
        is_daemon = 1;

    if (is_daemon)
        getopt_shortopts = "I:de:ns:t:ST:wx:?";
    else
        getopt_shortopts = "dt:?";

    while ((c = getopt_long(argc, argv, getopt_shortopts, longopts,
        &longindex)) != -1) {
        switch (c) {
        case 'I':
            pidfile = optarg;
            break;
        case 'd':
            debugging = 1;
            break;
        case 'e':
            test_cmd = strdup(optarg);
            break;
        case 'n':
            is_dry_run = 1;
            break;
#ifdef notyet
        case 'p':
            passive = 1;
            break;
#endif
        case 's':
            nap = fetchtimeout(c, NULL, optarg, 0);
            break;
        case 'S':
            do_syslog = 0;
            break;
        case 't':
            timeout_sec = atoi(optarg);
            timeout = parse_timeout_to_pow2ns(c, NULL, optarg);
            if (debugging)
                printf("Timeout is 2^%d nanoseconds\n",
                    timeout);
            break;
        case 'T':
            carp_thresh_seconds =
                fetchtimeout(c, "NULL", optarg, 0);
            break;
        case 'w':
            do_timedog = 1;
            break;
        case 'x':
            exit_timeout = parse_timeout_to_pow2ns(c, NULL, optarg);
            if (exit_timeout != 0)
                exit_timeout |= WD_ACTIVE;
            break;
        case 0:
            lopt = longopts[longindex].name;
            if (!strcmp(lopt, "pretimeout")) {
                pretimeout = fetchtimeout(0, lopt, optarg, 0);
            } else if (!strcmp(lopt, "pretimeout-action")) {
                pretimeout_act = timeout_act_str2int(lopt,
                    optarg);
            } else if (!strcmp(lopt, "softtimeout-action")) {
                softtimeout_act = timeout_act_str2int(lopt,
                    optarg);
            } else {
                /*
                warnx("bad option at index %d: %s", optind,
                    argv[optind]);
                usage();
                 */
            }
            break;
        case '?':
        default:
            usage();
            /* NOTREACHED */
        }
    }
+
+   if (nap > timeout_sec / 2)
+       nap = timeout_sec / 2;

    if (carp_thresh_seconds == -1)
        carp_thresh_seconds = nap;

    if (argc != optind)
        errx(EX_USAGE, "extra arguments.");
    if (is_daemon && timeout < WD_TO_1SEC)
        errx(EX_USAGE, "-t argument is less than one second.");
    if (pretimeout_set) {
        struct timespec ts;

        pow2ns_to_ts(timeout, &ts);
        if (pretimeout >= (uintmax_t)ts.tv_sec) {
            errx(EX_USAGE,
                "pretimeout (%d) >= timeout (%d -> %ld)\n"
                "see manual section TIMEOUT RESOLUTION",
                pretimeout, timeout_sec, (long)ts.tv_sec);
        }
    }
}
Index: user/alc/PQ_LAUNDRY
===================================================================
--- user/alc/PQ_LAUNDRY	(revision 308053)
+++ user/alc/PQ_LAUNDRY	(revision 308054)

Property changes on: user/alc/PQ_LAUNDRY
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r307981-308053
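/*
 * A standalone sketch, not part of the diff above, of the nap clamp that the
 * change adds to parseargs(): the sleep between pats is limited to at most
 * half of the requested timeout, so with -t 8 and the default 10-second nap
 * the daemon ends up patting the watchdog every 4 seconds.  The values below
 * are illustrative only.
 */
#include <stdio.h>

int
main(void)
{
    unsigned int timeout_sec = 8;   /* as set by -t 8 */
    unsigned int nap = 10;          /* default sleep between pats */

    if (nap > timeout_sec / 2)
        nap = timeout_sec / 2;
    printf("timeout %u s -> pat every %u s\n", timeout_sec, nap);
    /* prints: timeout 8 s -> pat every 4 s */
    return (0);
}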