Index: projects/runtime-coverage-v2/share/man/man4/ng_eiface.4 =================================================================== --- projects/runtime-coverage-v2/share/man/man4/ng_eiface.4 (revision 347598) +++ projects/runtime-coverage-v2/share/man/man4/ng_eiface.4 (revision 347599) @@ -1,122 +1,122 @@ .\" Copyright (c) 2004 Gleb Smirnoff .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd February 3, 2005 +.Dd May 14, 2019 .Dt NG_EIFACE 4 .Os .Sh NAME .Nm ng_eiface .Nd "generic Ethernet interface netgraph node type" .Sh SYNOPSIS .In netgraph/ng_eiface.h .Sh DESCRIPTION The .Vt eiface netgraph node implements the generic Ethernet interface. -When +When an .Vt eiface node is created, a new interface appears which is accessible via .Xr ifconfig 8 . These interfaces are named .Dq Li ngeth0 , .Dq Li ngeth1 , etc. When a node is shut down, the corresponding interface is removed, and the interface name becomes available for reuse by future .Vt eiface nodes. New nodes always take the first unused interface. .Sh HOOKS An .Vt eiface node has a single hook named .Va ether , which should be connected to the Ethernet downstream, for example, to the .Xr ng_vlan 4 node. Packets transmitted via the interface flow out this hook. Similarly, packets received on the hook go to the protocol stack as packets received by any real Ethernet interface. .Sh CONTROL MESSAGES This node type supports the generic control messages, plus the following: .Bl -tag -width foo .It Dv NGM_EIFACE_SET Pq Ic set Set link-level address of the interface. Requires .Vt "struct ether_addr" as an argument. This message also has an .Tn ASCII version, called .Dq Li set , which requires as an argument an .Tn ASCII string consisting of 6 colon-separated hex digits. .It Dv NGM_EIFACE_GET_IFNAME Pq Ic getifname Return the name of the associated interface as a -.Dv NUL Ns -terminated +.Dv NULL Ns -terminated .Tn ASCII string. .It Dv NGM_EIFACE_GET_IFADDRS Return the list of link-level addresses associated with the node. .El .Sh SHUTDOWN This node shuts down upon receipt of a .Dv NGM_SHUTDOWN control message. The associated interface is removed and its name becomes available for reuse by future .Vt eiface nodes. .Pp Unlike most other node types, an .Vt eiface node does .Em not -go away when all hooks have been disconnected; rather, and explicit +go away when all hooks have been disconnected; rather, an explicit .Dv NGM_SHUTDOWN control message is required. .Sh SEE ALSO .Xr netgraph 4 , .Xr ng_ether 4 , .Xr ng_iface 4 , .Xr ng_vlan 4 , .Xr ifconfig 8 , .Xr ngctl 8 .Sh HISTORY The .Vt eiface node type was implemented in .Fx 4.6 . .Sh AUTHORS .An -nosplit The .Vt eiface node type was written by .An Vitaly V Belekhov . This manual page was written by .An Gleb Smirnoff . Index: projects/runtime-coverage-v2/sys/amd64/amd64/exception.S =================================================================== --- projects/runtime-coverage-v2/sys/amd64/amd64/exception.S (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/amd64/exception.S (revision 347599) @@ -1,1324 +1,1326 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * Copyright (c) 2007-2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" #include "assym.inc" #include #include #include #include #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr .align 8 .type dtrace_invop_jump_addr,@object .size dtrace_invop_jump_addr,8 dtrace_invop_jump_addr: .zero 8 .globl dtrace_invop_calltrap_addr .align 8 .type dtrace_invop_calltrap_addr,@object .size dtrace_invop_calltrap_addr,8 dtrace_invop_calltrap_addr: .zero 8 #endif .text #ifdef HWPMC_HOOKS ENTRY(start_exceptions) #endif /*****************************************************************************/ /* Trap handling */ /*****************************************************************************/ /* * Trap and fault vector routines. * * All traps are 'interrupt gates', SDT_SYSIGT. An interrupt gate pushes * state on the stack but also disables interrupts. This is important for * us for the use of the swapgs instruction. We cannot be interrupted * until the GS.base value is correct. For most traps, we automatically * then enable interrupts if the interrupted context had them enabled. * This is equivalent to the i386 port's use of SDT_SYS386TGT. * * The cpu will push a certain amount of state onto the kernel stack for * the current process. See amd64/include/frame.h. * This includes the current RFLAGS (status register, which includes * the interrupt disable state prior to the trap), the code segment register, * and the return instruction pointer are pushed by the cpu. The cpu * will also push an 'error' code for certain traps. We push a dummy * error code for those traps where the cpu doesn't in order to maintain * a consistent frame. We also push a contrived 'trap number'. * * The CPU does not push the general registers, so we must do that, and we * must restore them prior to calling 'iret'. The CPU adjusts %cs and %ss * but does not mess with %ds, %es, %gs or %fs. We swap the %gs base for * for the kernel mode operation shortly, without changes to the selector * loaded. Since superuser long mode works with any selectors loaded into * segment registers other then %cs, which makes them mostly unused in long * mode, and kernel does not reference %fs, leave them alone. The segment * registers are reloaded on return to the usermode. */ MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) /* Traps that we leave interrupts disabled for. */ .macro TRAP_NOEN l, trapno PTI_ENTRY \l,X\l .globl X\l .type X\l,@function X\l: subq $TF_RIP,%rsp movl $\trapno,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) jmp alltraps_noen .endm TRAP_NOEN bpt, T_BPTFLT #ifdef KDTRACE_HOOKS TRAP_NOEN dtrace_ret, T_DTRACE_RET #endif /* Regular traps; The cpu does not supply tf_err for these. */ .macro TRAP l, trapno PTI_ENTRY \l,X\l .globl X\l .type X\l,@function X\l: subq $TF_RIP,%rsp movl $\trapno,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) jmp alltraps .endm TRAP div, T_DIVIDE TRAP ofl, T_OFLOW TRAP bnd, T_BOUND TRAP ill, T_PRIVINFLT TRAP dna, T_DNA TRAP fpusegm, T_FPOPFLT TRAP rsvd, T_RESERVED TRAP fpu, T_ARITHTRAP TRAP xmm, T_XMMFLT /* This group of traps have tf_err already pushed by the cpu. */ .macro TRAP_ERR l, trapno PTI_ENTRY \l,X\l,has_err=1 .globl X\l .type X\l,@function X\l: subq $TF_ERR,%rsp movl $\trapno,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) jmp alltraps .endm TRAP_ERR tss, T_TSSFLT TRAP_ERR align, T_ALIGNFLT /* * alltraps entry point. Use swapgs if this is the first time in the * kernel from userland. Reenable interrupts if they were enabled * before the trap. This approximates SDT_SYS386TGT on the i386 port. */ SUPERALIGN_TEXT .globl alltraps .type alltraps,@function alltraps: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) 1: SAVE_SEGS movq %rdx,TF_RDX(%rsp) movq %rax,TF_RAX(%rsp) movq %rcx,TF_RCX(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) jz 2f call handle_ibrs_entry 2: testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rax sti alltraps_pushregs_no_rax: movq %rsi,TF_RSI(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq FAKE_MCOUNT(TF_RIP(%rsp)) #ifdef KDTRACE_HOOKS /* * DTrace Function Boundary Trace (fbt) probes are triggered * by int3 (0xcc) which causes the #BP (T_BPTFLT) breakpoint * interrupt. For all other trap types, just handle them in * the usual way. */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jnz calltrap /* ignore userland traps */ cmpl $T_BPTFLT,TF_TRAPNO(%rsp) jne calltrap /* Check if there is no DTrace hook registered. */ cmpq $0,dtrace_invop_jump_addr je calltrap /* * Set our jump address for the jump back in the event that * the breakpoint wasn't caused by DTrace at all. */ movq $calltrap,dtrace_invop_calltrap_addr(%rip) /* Jump to the code hooked in by DTrace. */ jmpq *dtrace_invop_jump_addr #endif .globl calltrap .type calltrap,@function calltrap: movq %rsp,%rdi call trap_check MEXITCOUNT jmp doreti /* Handle any pending ASTs */ /* * alltraps_noen entry point. Unlike alltraps above, we want to * leave the interrupts disabled. This corresponds to * SDT_SYS386IGT on the i386 port. */ SUPERALIGN_TEXT .globl alltraps_noen .type alltraps_noen,@function alltraps_noen: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) 1: SAVE_SEGS movq %rdx,TF_RDX(%rsp) movq %rax,TF_RAX(%rsp) movq %rcx,TF_RCX(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) jz alltraps_pushregs_no_rax call handle_ibrs_entry jmp alltraps_pushregs_no_rax IDTVEC(dblfault) subq $TF_ERR,%rsp movl $T_DOUBLEFLT,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs 1: movq PCPU(KCR3),%rax cmpq $~0,%rax je 2f movq %rax,%cr3 2: movq %rsp,%rdi call dblfault_handler 3: hlt jmp 3b ALIGN_TEXT IDTVEC(page_pti) testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) jz Xpage swapgs pushq %rax movq %cr3,%rax movq %rax,PCPU(SAVED_UCR3) cmpq $~0,PCPU(UCR3) jne 1f popq %rax jmp 2f 1: pushq %rdx PTI_UUENTRY has_err=1 2: subq $TF_ERR,%rsp movq %rdi,TF_RDI(%rsp) movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) jmp page_u IDTVEC(page) subq $TF_ERR,%rsp movq %rdi,TF_RDI(%rsp) /* free up GP registers */ movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz page_cr2 /* already running with kernel GS.base */ swapgs page_u: movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) movq PCPU(SAVED_UCR3),%rax movq %rax,PCB_SAVED_UCR3(%rdi) call handle_ibrs_entry page_cr2: movq %cr2,%rdi /* preserve %cr2 before .. */ movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */ SAVE_SEGS movl $T_PAGEFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rax sti jmp alltraps_pushregs_no_rax /* * We have to special-case this one. If we get a trap in doreti() at * the iretq stage, we'll reenter with the wrong gs state. We'll have * to do a special the swapgs in this case even coming from the kernel. * XXX linux has a trap handler for their equivalent of load_gs(). * * On the stack, we have the hardware interrupt frame to return * to usermode (faulted) and another frame with error code, for * fault. For PTI, copy both frames to the main thread stack. * Handle the potential 16-byte alignment adjustment incurred * during the second fault by copying both frames independently * while unwinding the stack in between. */ .macro PROTF_ENTRY name,trapno \name\()_pti_doreti: swapgs cmpq $~0,PCPU(UCR3) je 1f pushq %rax pushq %rdx movq PCPU(KCR3),%rax movq %rax,%cr3 movq PCPU(RSP0),%rax subq $2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */ MOVE_STACKS (PTI_SIZE / 8) addq $PTI_SIZE,%rax movq PTI_RSP(%rsp),%rsp MOVE_STACKS (PTI_SIZE / 8 - 3) subq $PTI_SIZE,%rax movq %rax,%rsp popq %rdx popq %rax 1: swapgs jmp X\name IDTVEC(\name\()_pti) cmpq $doreti_iret,PTI_RIP-2*8(%rsp) je \name\()_pti_doreti testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */ jz X\name PTI_UENTRY has_err=1 swapgs IDTVEC(\name) subq $TF_ERR,%rsp movl $\trapno,TF_TRAPNO(%rsp) jmp prot_addrf .endm PROTF_ENTRY missing, T_SEGNPFLT PROTF_ENTRY stk, T_STKFLT PROTF_ENTRY prot, T_PROTFLT prot_addrf: movq $0,TF_ADDR(%rsp) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) leaq doreti_iret(%rip),%rdi cmpq %rdi,TF_RIP(%rsp) je 5f /* kernel but with user gsbase!! */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 6f /* already running with kernel GS.base */ testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 2f cmpw $KUF32SEL,TF_FS(%rsp) jne 1f rdfsbase %rax 1: cmpw $KUG32SEL,TF_GS(%rsp) jne 2f rdgsbase %rdx 2: swapgs movq PCPU(CURPCB),%rdi testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 4f cmpw $KUF32SEL,TF_FS(%rsp) jne 3f movq %rax,PCB_FSBASE(%rdi) 3: cmpw $KUG32SEL,TF_GS(%rsp) jne 4f movq %rdx,PCB_GSBASE(%rdi) 4: call handle_ibrs_entry orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */ movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rax sti jmp alltraps_pushregs_no_rax 5: swapgs 6: movq PCPU(CURPCB),%rdi jmp 4b /* * Fast syscall entry point. We enter here with just our new %cs/%ss set, * and the new privilige level. We are still running on the old user stack * pointer. We have to juggle a few things around to find our stack etc. * swapgs gives us access to our PCPU space only. * * We do not support invoking this from a custom segment registers, * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT. */ SUPERALIGN_TEXT IDTVEC(fast_syscall_pti) swapgs movq %rax,PCPU(SCRATCH_RAX) cmpq $~0,PCPU(UCR3) je fast_syscall_common movq PCPU(KCR3),%rax movq %rax,%cr3 jmp fast_syscall_common SUPERALIGN_TEXT IDTVEC(fast_syscall) swapgs movq %rax,PCPU(SCRATCH_RAX) fast_syscall_common: movq %rsp,PCPU(SCRATCH_RSP) movq PCPU(RSP0),%rsp /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */ subq $TF_SIZE,%rsp /* defer TF_RSP till we have a spare register */ movq %r11,TF_RFLAGS(%rsp) movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */ movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */ movq %r11,TF_RSP(%rsp) /* user stack pointer */ movq PCPU(SCRATCH_RAX),%rax /* * Save a few arg registers early to free them for use in * handle_ibrs_entry(). %r10 is especially tricky. It is not an * arg register, but it holds the arg register %rcx. Profiling * preserves %rcx, but may clobber %r10. Profiling may also * clobber %r11, but %r11 (original %eflags) has been saved. */ movq %rax,TF_RAX(%rsp) /* syscall number */ movq %rdx,TF_RDX(%rsp) /* arg 3 */ movq %r10,TF_RCX(%rsp) /* arg 4 */ SAVE_SEGS call handle_ibrs_entry movq PCPU(CURPCB),%r11 andl $~PCB_FULL_IRET,PCB_FLAGS(%r11) sti movq $KUDSEL,TF_SS(%rsp) movq $KUCSEL,TF_CS(%rsp) movq $2,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) /* arg 1 */ movq %rsi,TF_RSI(%rsp) /* arg 2 */ movq %r8,TF_R8(%rsp) /* arg 5 */ movq %r9,TF_R9(%rsp) /* arg 6 */ movq %rbx,TF_RBX(%rsp) /* C preserved */ movq %rbp,TF_RBP(%rsp) /* C preserved */ movq %r12,TF_R12(%rsp) /* C preserved */ movq %r13,TF_R13(%rsp) /* C preserved */ movq %r14,TF_R14(%rsp) /* C preserved */ movq %r15,TF_R15(%rsp) /* C preserved */ movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) movq PCPU(CURTHREAD),%rdi movq %rsp,TD_FRAME(%rdi) movl TF_RFLAGS(%rsp),%esi andl $PSL_T,%esi call amd64_syscall 1: movq PCPU(CURPCB),%rax /* Disable interrupts before testing PCB_FULL_IRET. */ cli testl $PCB_FULL_IRET,PCB_FLAGS(%rax) jnz 4f /* Check for and handle AST's on return to userland. */ movq PCPU(CURTHREAD),%rax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) jne 3f call handle_ibrs_exit + callq *mds_handler /* Restore preserved registers. */ MEXITCOUNT movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */ movq TF_RSI(%rsp),%rsi /* bonus: preserve arg 2 */ movq TF_RDX(%rsp),%rdx /* return value 2 */ movq TF_RAX(%rsp),%rax /* return value 1 */ movq TF_RFLAGS(%rsp),%r11 /* original %rflags */ movq TF_RIP(%rsp),%rcx /* original %rip */ movq TF_RSP(%rsp),%rsp /* user stack pointer */ xorl %r8d,%r8d /* zero the rest of GPRs */ xorl %r10d,%r10d cmpq $~0,PCPU(UCR3) je 2f movq PCPU(UCR3),%r9 movq %r9,%cr3 2: xorl %r9d,%r9d swapgs sysretq 3: /* AST scheduled. */ sti movq %rsp,%rdi call ast jmp 1b 4: /* Requested full context restore, use doreti for that. */ MEXITCOUNT jmp doreti /* * Here for CYA insurance, in case a "syscall" instruction gets * issued from 32 bit compatibility mode. MSR_CSTAR has to point * to *something* if EFER_SCE is enabled. */ IDTVEC(fast_syscall32) sysret /* * DB# handler is very similar to NM#, because 'mov/pop %ss' delay * generation of exception until the next instruction is executed, * which might be a kernel entry. So we must execute the handler * on IST stack and be ready for non-kernel GSBASE. */ IDTVEC(dbg) subq $TF_RIP,%rsp movl $(T_TRCTRAP),TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq testb $SEL_RPL_MASK,TF_CS(%rsp) jnz dbg_fromuserspace /* * We've interrupted the kernel. Preserve GS.base in %r12, * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. */ movl $MSR_GSBASE,%ecx rdmsr movq %rax,%r12 shlq $32,%rdx orq %rdx,%r12 /* Retrieve and load the canonical value for GS.base. */ movq TF_SIZE(%rsp),%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 2f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr movl %eax,%r14d call handle_ibrs_entry 2: FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp,%rdi call trap MEXITCOUNT testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 3f movl %r14d,%eax xorl %edx,%edx movl $MSR_IA32_SPEC_CTRL,%ecx wrmsr /* * Put back the preserved MSR_GSBASE value. */ 3: movl $MSR_GSBASE,%ecx movq %r12,%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %r13,%cr3 RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret dbg_fromuserspace: /* * Switch to kernel GSBASE and kernel page table, and copy frame * from the IST stack to the normal kernel stack, since trap() * re-enables interrupts, and since we might trap on DB# while * in trap(). */ swapgs movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: movq PCPU(RSP0),%rax movl $TF_SIZE,%ecx subq %rcx,%rax movq %rax,%rdi movq %rsp,%rsi rep;movsb movq %rax,%rsp call handle_ibrs_entry movq PCPU(CURPCB),%rdi orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 3f cmpw $KUF32SEL,TF_FS(%rsp) jne 2f rdfsbase %rax movq %rax,PCB_FSBASE(%rdi) 2: cmpw $KUG32SEL,TF_GS(%rsp) jne 3f movl $MSR_KGSBASE,%ecx rdmsr shlq $32,%rdx orq %rdx,%rax movq %rax,PCB_GSBASE(%rdi) 3: jmp calltrap /* * NMI handling is special. * * First, NMIs do not respect the state of the processor's RFLAGS.IF * bit. The NMI handler may be entered at any time, including when * the processor is in a critical section with RFLAGS.IF == 0. * The processor's GS.base value could be invalid on entry to the * handler. * * Second, the processor treats NMIs specially, blocking further NMIs * until an 'iretq' instruction is executed. We thus need to execute * the NMI handler with interrupts disabled, to prevent a nested interrupt * from executing an 'iretq' instruction and inadvertently taking the * processor out of NMI mode. * * Third, the NMI handler runs on its own stack (tss_ist2). The canonical * GS.base value for the processor is stored just above the bottom of its * NMI stack. For NMIs taken from kernel mode, the current value in * the processor's GS.base is saved at entry to C-preserved register %r12, * the canonical value for GS.base is then loaded into the processor, and * the saved value is restored at exit time. For NMIs taken from user mode, * the cheaper 'SWAPGS' instructions are used for swapping GS.base. */ IDTVEC(nmi) subq $TF_RIP,%rsp movl $(T_NMI),TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq xorl %ebx,%ebx testb $SEL_RPL_MASK,TF_CS(%rsp) jnz nmi_fromuserspace /* * We've interrupted the kernel. Preserve GS.base in %r12, * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. */ movl $MSR_GSBASE,%ecx rdmsr movq %rax,%r12 shlq $32,%rdx orq %rdx,%r12 /* Retrieve and load the canonical value for GS.base. */ movq TF_SIZE(%rsp),%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je nmi_calltrap movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr movl %eax,%r14d call handle_ibrs_entry jmp nmi_calltrap nmi_fromuserspace: incl %ebx swapgs movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: call handle_ibrs_entry movq PCPU(CURPCB),%rdi testq %rdi,%rdi jz 3f orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 3f cmpw $KUF32SEL,TF_FS(%rsp) jne 2f rdfsbase %rax movq %rax,PCB_FSBASE(%rdi) 2: cmpw $KUG32SEL,TF_GS(%rsp) jne 3f movl $MSR_KGSBASE,%ecx rdmsr shlq $32,%rdx orq %rdx,%rax movq %rax,PCB_GSBASE(%rdi) 3: /* Note: this label is also used by ddb and gdb: */ nmi_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp,%rdi call trap MEXITCOUNT #ifdef HWPMC_HOOKS /* * Capture a userspace callchain if needed. * * - Check if the current trap was from user mode. * - Check if the current thread is valid. * - Check if the thread requires a user call chain to be * captured. * * We are still in NMI mode at this point. */ testl %ebx,%ebx jz nocallchain /* not from userspace */ movq PCPU(CURTHREAD),%rax orq %rax,%rax /* curthread present? */ jz nocallchain /* * Move execution to the regular kernel stack, because we * committed to return through doreti. */ movq %rsp,%rsi /* source stack pointer */ movq $TF_SIZE,%rcx movq PCPU(RSP0),%rdx subq %rcx,%rdx movq %rdx,%rdi /* destination stack pointer */ shrq $3,%rcx /* trap frame size in long words */ pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq rep movsq /* copy trapframe */ movq %rdx,%rsp /* we are on the regular kstack */ testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */ jz nocallchain /* * A user callchain is to be captured, so: * - Take the processor out of "NMI" mode by faking an "iret", * to allow for nested NMI interrupts. * - Enable interrupts, so that copyin() can work. */ movl %ss,%eax pushq %rax /* tf_ss */ pushq %rdx /* tf_rsp (on kernel stack) */ pushfq /* tf_rflags */ movl %cs,%eax pushq %rax /* tf_cs */ pushq $outofnmi /* tf_rip */ iretq outofnmi: /* * At this point the processor has exited NMI mode and is running * with interrupts turned off on the normal kernel stack. * * If a pending NMI gets recognized at or after this point, it * will cause a kernel callchain to be traced. * * We turn interrupts back on, and call the user callchain capture hook. */ movq pmc_hook,%rax orq %rax,%rax jz nocallchain movq PCPU(CURTHREAD),%rdi /* thread */ movq $PMC_FN_USER_CALLCHAIN,%rsi /* command */ movq %rsp,%rdx /* frame */ sti call *%rax cli nocallchain: #endif testl %ebx,%ebx /* %ebx == 0 => return to userland */ jnz doreti_exit /* * Restore speculation control MSR, if preserved. */ testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 1f movl %r14d,%eax xorl %edx,%edx movl $MSR_IA32_SPEC_CTRL,%ecx wrmsr /* * Put back the preserved MSR_GSBASE value. */ 1: movl $MSR_GSBASE,%ecx movq %r12,%rdx movl %edx,%eax shrq $32,%rdx wrmsr cmpb $0, nmi_flush_l1d_sw(%rip) je 2f call flush_l1d_sw /* bhyve L1TF assist */ 2: movq %r13,%cr3 RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret /* * MC# handling is similar to NMI. * * As with NMIs, machine check exceptions do not respect RFLAGS.IF and * can occur at any time with a GS.base value that does not correspond * to the privilege level in CS. * * Machine checks are not unblocked by iretq, but it is best to run * the handler with interrupts disabled since the exception may have * interrupted a critical section. * * The MC# handler runs on its own stack (tss_ist3). The canonical * GS.base value for the processor is stored just above the bottom of * its MC# stack. For exceptions taken from kernel mode, the current * value in the processor's GS.base is saved at entry to C-preserved * register %r12, the canonical value for GS.base is then loaded into * the processor, and the saved value is restored at exit time. For * exceptions taken from user mode, the cheaper 'SWAPGS' instructions * are used for swapping GS.base. */ IDTVEC(mchk) subq $TF_RIP,%rsp movl $(T_MCHK),TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) pushfq andq $~(PSL_D | PSL_AC),(%rsp) popfq xorl %ebx,%ebx testb $SEL_RPL_MASK,TF_CS(%rsp) jnz mchk_fromuserspace /* * We've interrupted the kernel. Preserve GS.base in %r12, * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTL in %r14d. */ movl $MSR_GSBASE,%ecx rdmsr movq %rax,%r12 shlq $32,%rdx orq %rdx,%r12 /* Retrieve and load the canonical value for GS.base. */ movq TF_SIZE(%rsp),%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je mchk_calltrap movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr movl %eax,%r14d call handle_ibrs_entry jmp mchk_calltrap mchk_fromuserspace: incl %ebx swapgs movq %cr3,%r13 movq PCPU(KCR3),%rax cmpq $~0,%rax je 1f movq %rax,%cr3 1: call handle_ibrs_entry /* Note: this label is also used by ddb and gdb: */ mchk_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp,%rdi call mca_intr MEXITCOUNT testl %ebx,%ebx /* %ebx == 0 => return to userland */ jnz doreti_exit /* * Restore speculation control MSR, if preserved. */ testl $CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip) je 1f movl %r14d,%eax xorl %edx,%edx movl $MSR_IA32_SPEC_CTRL,%ecx wrmsr /* * Put back the preserved MSR_GSBASE value. */ 1: movl $MSR_GSBASE,%ecx movq %r12,%rdx movl %edx,%eax shrq $32,%rdx wrmsr movq %r13,%cr3 RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret ENTRY(fork_trampoline) movq %r12,%rdi /* function */ movq %rbx,%rsi /* arg1 */ movq %rsp,%rdx /* trapframe pointer */ call fork_exit MEXITCOUNT jmp doreti /* Handle any ASTs */ /* * To efficiently implement classification of trap and interrupt handlers * for profiling, there must be only trap handlers between the labels btrap * and bintr, and only interrupt handlers between the labels bintr and * eintr. This is implemented (partly) by including files that contain * some of the handlers. Before including the files, set up a normal asm * environment so that the included files doen't need to know that they are * included. */ #ifdef COMPAT_FREEBSD32 .data .p2align 4 .text SUPERALIGN_TEXT #include #endif .data .p2align 4 .text SUPERALIGN_TEXT MCOUNT_LABEL(bintr) #include #ifdef DEV_ATPIC .data .p2align 4 .text SUPERALIGN_TEXT #include #endif .text MCOUNT_LABEL(eintr) /* * void doreti(struct trapframe) * * Handle return from interrupts, traps and syscalls. */ .text SUPERALIGN_TEXT .type doreti,@function .globl doreti doreti: FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ /* * Check if ASTs can be handled now. */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* are we returning to user mode? */ jz doreti_exit /* can't handle ASTs now if not */ doreti_ast: /* * Check for ASTs atomically with returning. Disabling CPU * interrupts provides sufficient locking even in the SMP case, * since we will be informed of any new ASTs by an IPI. */ cli movq PCPU(CURTHREAD),%rax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) je doreti_exit sti movq %rsp,%rdi /* pass a pointer to the trapframe */ call ast jmp doreti_ast /* * doreti_exit: pop registers, iret. * * The segment register pop is a special case, since it may * fault if (for example) a sigreturn specifies bad segment * registers. The fault is handled in trap.c. */ doreti_exit: MEXITCOUNT movq PCPU(CURPCB),%r8 /* * Do not reload segment registers for kernel. * Since we do not reload segments registers with sane * values on kernel entry, descriptors referenced by * segments registers might be not valid. This is fatal * for user mode, but is not a problem for the kernel. */ testb $SEL_RPL_MASK,TF_CS(%rsp) jz ld_regs testl $PCB_FULL_IRET,PCB_FLAGS(%r8) jz ld_regs andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) testl $TF_HASSEGS,TF_FLAGS(%rsp) je set_segs do_segs: /* Restore %fs and fsbase */ movw TF_FS(%rsp),%ax .globl ld_fs ld_fs: movw %ax,%fs cmpw $KUF32SEL,%ax jne 1f movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%r8),%eax movl PCB_FSBASE+4(%r8),%edx .globl ld_fsbase ld_fsbase: wrmsr 1: /* Restore %gs and gsbase */ movw TF_GS(%rsp),%si pushfq cli movl $MSR_GSBASE,%ecx /* Save current kernel %gs base into %r12d:%r13d */ rdmsr movl %eax,%r12d movl %edx,%r13d .globl ld_gs ld_gs: movw %si,%gs /* Save user %gs base into %r14d:%r15d */ rdmsr movl %eax,%r14d movl %edx,%r15d /* Restore kernel %gs base */ movl %r12d,%eax movl %r13d,%edx wrmsr popfq /* * Restore user %gs base, either from PCB if used for TLS, or * from the previously saved msr read. */ movl $MSR_KGSBASE,%ecx cmpw $KUG32SEL,%si jne 1f movl PCB_GSBASE(%r8),%eax movl PCB_GSBASE+4(%r8),%edx jmp ld_gsbase 1: movl %r14d,%eax movl %r15d,%edx .globl ld_gsbase ld_gsbase: wrmsr /* May trap if non-canonical, but only for TLS. */ .globl ld_es ld_es: movw TF_ES(%rsp),%es .globl ld_ds ld_ds: movw TF_DS(%rsp),%ds ld_regs: RESTORE_REGS testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 2f /* keep running with kernel GS.base */ cli call handle_ibrs_exit_rs + callq *mds_handler cmpq $~0,PCPU(UCR3) je 1f pushq %rdx movq PCPU(PTI_RSP0),%rdx subq $PTI_SIZE,%rdx movq %rax,PTI_RAX(%rdx) popq %rax movq %rax,PTI_RDX(%rdx) movq TF_RIP(%rsp),%rax movq %rax,PTI_RIP(%rdx) movq TF_CS(%rsp),%rax movq %rax,PTI_CS(%rdx) movq TF_RFLAGS(%rsp),%rax movq %rax,PTI_RFLAGS(%rdx) movq TF_RSP(%rsp),%rax movq %rax,PTI_RSP(%rdx) movq TF_SS(%rsp),%rax movq %rax,PTI_SS(%rdx) movq PCPU(UCR3),%rax swapgs movq %rdx,%rsp movq %rax,%cr3 popq %rdx popq %rax addq $8,%rsp jmp doreti_iret 1: swapgs 2: addq $TF_RIP,%rsp .globl doreti_iret doreti_iret: iretq set_segs: movw $KUDSEL,%ax movw %ax,TF_DS(%rsp) movw %ax,TF_ES(%rsp) movw $KUF32SEL,TF_FS(%rsp) movw $KUG32SEL,TF_GS(%rsp) jmp do_segs /* * doreti_iret_fault. Alternative return code for * the case where we get a fault in the doreti_exit code * above. trap() (amd64/amd64/trap.c) catches this specific * case, sends the process a signal and continues in the * corresponding place in the code below. */ ALIGN_TEXT .globl doreti_iret_fault doreti_iret_fault: subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */ movq %rax,TF_RAX(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) call handle_ibrs_entry testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) movq %r11,TF_R11(%rsp) movq %r12,TF_R12(%rsp) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) movl $T_PROTFLT,TF_TRAPNO(%rsp) movq $0,TF_ERR(%rsp) /* XXX should be the error code */ movq $0,TF_ADDR(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) jmp calltrap ALIGN_TEXT .globl ds_load_fault ds_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movw $KUDSEL,TF_DS(%rsp) jmp doreti ALIGN_TEXT .globl es_load_fault es_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movw $KUDSEL,TF_ES(%rsp) jmp doreti ALIGN_TEXT .globl fs_load_fault fs_load_fault: testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movl $T_PROTFLT,TF_TRAPNO(%rsp) movq %rsp,%rdi call trap movw $KUF32SEL,TF_FS(%rsp) jmp doreti ALIGN_TEXT .globl gs_load_fault gs_load_fault: popfq movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movw $KUG32SEL,TF_GS(%rsp) jmp doreti ALIGN_TEXT .globl fsbase_load_fault fsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 movq $0,PCB_FSBASE(%r8) jmp doreti ALIGN_TEXT .globl gsbase_load_fault gsbase_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti 1: movq %rsp,%rdi call trap movq PCPU(CURTHREAD),%r8 movq TD_PCB(%r8),%r8 movq $0,PCB_GSBASE(%r8) jmp doreti #ifdef HWPMC_HOOKS ENTRY(end_exceptions) #endif Index: projects/runtime-coverage-v2/sys/amd64/amd64/genassym.c =================================================================== --- projects/runtime-coverage-v2/sys/amd64/amd64/genassym.c (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/amd64/genassym.c (revision 347599) @@ -1,271 +1,274 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(MD_LDT_SD, offsetof(struct mdproc, md_ldt_sd)); ASSYM(MD_EFIRT_TMP, offsetof(struct mdthread, md_efirt_tmp)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_FRAME, offsetof(struct thread, td_frame)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(TDP_KTHREAD, TDP_KTHREAD); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(addr_PTmap, addr_PTmap); ASSYM(addr_PDmap, addr_PDmap); ASSYM(addr_PDPmap, addr_PDPmap); ASSYM(addr_PML4map, addr_PML4map); ASSYM(addr_PML4pml4e, addr_PML4pml4e); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(PDPSHIFT, PDPSHIFT); ASSYM(PML4SHIFT, PML4SHIFT); ASSYM(val_KPDPI, KPDPI); ASSYM(val_KPML4I, KPML4I); ASSYM(val_PML4PML4I, PML4PML4I); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(DMAP_MIN_ADDRESS, DMAP_MIN_ADDRESS); ASSYM(DMAP_MAX_ADDRESS, DMAP_MAX_ADDRESS); ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15)); ASSYM(PCB_R14, offsetof(struct pcb, pcb_r14)); ASSYM(PCB_R13, offsetof(struct pcb, pcb_r13)); ASSYM(PCB_R12, offsetof(struct pcb, pcb_r12)); ASSYM(PCB_RBP, offsetof(struct pcb, pcb_rbp)); ASSYM(PCB_RSP, offsetof(struct pcb, pcb_rsp)); ASSYM(PCB_RBX, offsetof(struct pcb, pcb_rbx)); ASSYM(PCB_RIP, offsetof(struct pcb, pcb_rip)); ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase)); ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase)); ASSYM(PCB_KGSBASE, offsetof(struct pcb, pcb_kgsbase)); ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0)); ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2)); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt)); ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt)); ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_SAVED_UCR3, offsetof(struct pcb, pcb_saved_ucr3)); ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer)); ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star)); ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar)); ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar)); ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_FULL_IRET, PCB_FULL_IRET); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_32BIT, PCB_32BIT); ASSYM(TSS_RSP0, offsetof(struct amd64tss, tss_rsp0)); ASSYM(TF_R15, offsetof(struct trapframe, tf_r15)); ASSYM(TF_R14, offsetof(struct trapframe, tf_r14)); ASSYM(TF_R13, offsetof(struct trapframe, tf_r13)); ASSYM(TF_R12, offsetof(struct trapframe, tf_r12)); ASSYM(TF_R11, offsetof(struct trapframe, tf_r11)); ASSYM(TF_R10, offsetof(struct trapframe, tf_r10)); ASSYM(TF_R9, offsetof(struct trapframe, tf_r9)); ASSYM(TF_R8, offsetof(struct trapframe, tf_r8)); ASSYM(TF_RDI, offsetof(struct trapframe, tf_rdi)); ASSYM(TF_RSI, offsetof(struct trapframe, tf_rsi)); ASSYM(TF_RBP, offsetof(struct trapframe, tf_rbp)); ASSYM(TF_RBX, offsetof(struct trapframe, tf_rbx)); ASSYM(TF_RDX, offsetof(struct trapframe, tf_rdx)); ASSYM(TF_RCX, offsetof(struct trapframe, tf_rcx)); ASSYM(TF_RAX, offsetof(struct trapframe, tf_rax)); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ADDR, offsetof(struct trapframe, tf_addr)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_RIP, offsetof(struct trapframe, tf_rip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_RFLAGS, offsetof(struct trapframe, tf_rflags)); ASSYM(TF_RSP, offsetof(struct trapframe, tf_rsp)); ASSYM(TF_SS, offsetof(struct trapframe, tf_ss)); ASSYM(TF_DS, offsetof(struct trapframe, tf_ds)); ASSYM(TF_ES, offsetof(struct trapframe, tf_es)); ASSYM(TF_FS, offsetof(struct trapframe, tf_fs)); ASSYM(TF_GS, offsetof(struct trapframe, tf_gs)); ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags)); ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_HASSEGS, TF_HASSEGS); ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx)); ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax)); ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err)); ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip)); ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs)); ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags)); ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp)); ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss)); ASSYM(PTI_SIZE, sizeof(struct pti_frame)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); ASSYM(PC_FS32P, offsetof(struct pcpu, pc_fs32p)); ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p)); ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt)); ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp)); ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss)); ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt)); ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3)); ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3)); ASSYM(PC_SAVED_UCR3, offsetof(struct pcpu, pc_saved_ucr3)); ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack)); ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ); ASSYM(PC_PTI_RSP0, offsetof(struct pcpu, pc_pti_rsp0)); ASSYM(PC_IBPB_SET, offsetof(struct pcpu, pc_ibpb_set)); +ASSYM(PC_MDS_TMP, offsetof(struct pcpu, pc_mds_tmp)); +ASSYM(PC_MDS_BUF, offsetof(struct pcpu, pc_mds_buf)); +ASSYM(PC_MDS_BUF64, offsetof(struct pcpu, pc_mds_buf64)); ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL); ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL)); ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL)); ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL)); ASSYM(KUF32SEL, GSEL(GUFS32_SEL, SEL_UPL)); ASSYM(KUG32SEL, GSEL(GUGS32_SEL, SEL_UPL)); ASSYM(TSSSEL, GSEL(GPROC0_SEL, SEL_KPL)); ASSYM(LDTSEL, GSEL(GUSERLDT_SEL, SEL_KPL)); ASSYM(SEL_RPL_MASK, SEL_RPL_MASK); ASSYM(__FreeBSD_version, __FreeBSD_version); #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif ASSYM(EC_EFI_STATUS, offsetof(struct efirt_callinfo, ec_efi_status)); ASSYM(EC_FPTR, offsetof(struct efirt_callinfo, ec_fptr)); ASSYM(EC_ARGCNT, offsetof(struct efirt_callinfo, ec_argcnt)); ASSYM(EC_ARG1, offsetof(struct efirt_callinfo, ec_arg1)); ASSYM(EC_ARG2, offsetof(struct efirt_callinfo, ec_arg2)); ASSYM(EC_ARG3, offsetof(struct efirt_callinfo, ec_arg3)); ASSYM(EC_ARG4, offsetof(struct efirt_callinfo, ec_arg4)); ASSYM(EC_ARG5, offsetof(struct efirt_callinfo, ec_arg5)); ASSYM(EC_RBX, offsetof(struct efirt_callinfo, ec_rbx)); ASSYM(EC_RSP, offsetof(struct efirt_callinfo, ec_rsp)); ASSYM(EC_RBP, offsetof(struct efirt_callinfo, ec_rbp)); ASSYM(EC_R12, offsetof(struct efirt_callinfo, ec_r12)); ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13)); ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14)); ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15)); Index: projects/runtime-coverage-v2/sys/amd64/amd64/initcpu.c =================================================================== --- projects/runtime-coverage-v2/sys/amd64/amd64/initcpu.c (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/amd64/initcpu.c (revision 347599) @@ -1,305 +1,306 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) KATO Takenori, 1997, 1998. * * All rights reserved. Unpublished rights reserved under the copyright * laws of Japan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer as * the first lines of this file unmodified. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); static int lower_sharedpage_init; int hw_lower_amd64_sharedpage; SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN, &hw_lower_amd64_sharedpage, 0, "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory"); /* * -1: automatic (default) * 0: keep enable CLFLUSH * 1: force disable CLFLUSH */ static int hw_clflush_disable = -1; static void init_amd(void) { uint64_t msr; /* * Work around Erratum 721 for Family 10h and 12h processors. * These processors may incorrectly update the stack pointer * after a long series of push and/or near-call instructions, * or a long series of pop and/or near-return instructions. * * http://support.amd.com/us/Processor_TechDocs/41322_10h_Rev_Gd.pdf * http://support.amd.com/us/Processor_TechDocs/44739_12h_Rev_Gd.pdf * * Hypervisors do not provide access to the errata MSR, * causing #GP exception on attempt to apply the errata. The * MSR write shall be done on host and persist globally * anyway, so do not try to do it when under virtualization. */ switch (CPUID_TO_FAMILY(cpu_id)) { case 0x10: case 0x12: if ((cpu_feature2 & CPUID2_HV) == 0) wrmsr(0xc0011029, rdmsr(0xc0011029) | 1); break; } /* * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should per BKDG. * So, do it here or otherwise some tools could be confused by * Initial Local APIC ID reported with CPUID Function 1 in EBX. */ if (CPUID_TO_FAMILY(cpu_id) == 0x10) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(MSR_NB_CFG1); msr |= (uint64_t)1 << 54; wrmsr(MSR_NB_CFG1, msr); } } /* * BIOS may configure Family 10h processors to convert WC+ cache type * to CD. That can hurt performance of guest VMs using nested paging. * The relevant MSR bit is not documented in the BKDG, * the fix is borrowed from Linux. */ if (CPUID_TO_FAMILY(cpu_id) == 0x10) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(0xc001102a); msr &= ~((uint64_t)1 << 24); wrmsr(0xc001102a, msr); } } /* * Work around Erratum 793: Specific Combination of Writes to Write * Combined Memory Types and Locked Instructions May Cause Core Hang. * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors, * revision 3.04 or later, publication 51810. */ if (CPUID_TO_FAMILY(cpu_id) == 0x16 && CPUID_TO_MODEL(cpu_id) <= 0xf) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(0xc0011020); msr |= (uint64_t)1 << 15; wrmsr(0xc0011020, msr); } } /* Ryzen erratas. */ if (CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1 && (cpu_feature2 & CPUID2_HV) == 0) { /* 1021 */ msr = rdmsr(0xc0011029); msr |= 0x2000; wrmsr(0xc0011029, msr); /* 1033 */ msr = rdmsr(0xc0011020); msr |= 0x10; wrmsr(0xc0011020, msr); /* 1049 */ msr = rdmsr(0xc0011028); msr |= 0x10; wrmsr(0xc0011028, msr); /* 1095 */ msr = rdmsr(0xc0011020); msr |= 0x200000000000000; wrmsr(0xc0011020, msr); } /* * Work around a problem on Ryzen that is triggered by executing * code near the top of user memory, in our case the signal * trampoline code in the shared page on amd64. * * This function is executed once for the BSP before tunables take * effect so the value determined here can be overridden by the * tunable. This function is then executed again for each AP and * also on resume. Set a flag the first time so that value set by * the tunable is not overwritten. * * The stepping and/or microcode versions should be checked after * this issue is fixed by AMD so that we don't use this mode if not * needed. */ if (lower_sharedpage_init == 0) { lower_sharedpage_init = 1; if (CPUID_TO_FAMILY(cpu_id) == 0x17) { hw_lower_amd64_sharedpage = 1; } } } /* * Initialize special VIA features */ static void init_via(void) { u_int regs[4], val; /* * Check extended CPUID for PadLock features. * * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf */ do_cpuid(0xc0000000, regs); if (regs[0] >= 0xc0000001) { do_cpuid(0xc0000001, regs); val = regs[3]; } else return; /* Enable RNG if present. */ if ((val & VIA_CPUID_HAS_RNG) != 0) { via_feature_rng = VIA_HAS_RNG; wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG); } /* Enable PadLock if present. */ if ((val & VIA_CPUID_HAS_ACE) != 0) via_feature_xcrypt |= VIA_HAS_AES; if ((val & VIA_CPUID_HAS_ACE2) != 0) via_feature_xcrypt |= VIA_HAS_AESCTR; if ((val & VIA_CPUID_HAS_PHE) != 0) via_feature_xcrypt |= VIA_HAS_SHA; if ((val & VIA_CPUID_HAS_PMM) != 0) via_feature_xcrypt |= VIA_HAS_MM; if (via_feature_xcrypt != 0) wrmsr(0x1107, rdmsr(0x1107) | (1 << 28)); } /* * Initialize CPU control registers */ void initializecpu(void) { uint64_t msr; uint32_t cr4; cr4 = rcr4(); if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { cr4 |= CR4_FXSR | CR4_XMM; cpu_fxsr = hw_instruction_sse = 1; } if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) cr4 |= CR4_FSGSBASE; if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) cr4 |= CR4_PKE; /* * Postpone enabling the SMEP on the boot CPU until the page * tables are switched from the boot loader identity mapping * to the kernel tables. The boot loader enables the U bit in * its tables. */ if (!IS_BSP()) { if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; } load_cr4(cr4); if (IS_BSP() && (amd_feature & AMDID_NX) != 0) { msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); pg_nx = PG_NX; } hw_ibrs_recalculate(); hw_ssb_recalculate(false); amd64_syscall_ret_flush_l1d_recalc(); + hw_mds_recalculate(); switch (cpu_vendor_id) { case CPU_VENDOR_AMD: init_amd(); break; case CPU_VENDOR_CENTAUR: init_via(); break; } if ((amd_feature & AMDID_RDTSCP) != 0 || (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0) wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid)); } void initializecpucache(void) { /* * CPUID with %eax = 1, %ebx returns * Bits 15-8: CLFLUSH line size * (Value * 8 = cache line size in bytes) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; /* * XXXKIB: (temporary) hack to work around traps generated * when CLFLUSHing APIC register window under virtualization * environments. These environments tend to disable the * CPUID_SS feature even though the native CPU supports it. */ TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } /* * The kernel's use of CLFLUSH{,OPT} can be disabled manually * by setting the hw.clflush_disable tunable. */ if (hw_clflush_disable == 1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } } Index: projects/runtime-coverage-v2/sys/amd64/amd64/machdep.c =================================================================== --- projects/runtime-coverage-v2/sys/amd64/amd64/machdep.c (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/amd64/machdep.c (revision 347599) @@ -1,2740 +1,2741 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_pci.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); /* * The PTI trampoline stack needs enough space for a hardware trapframe and a * couple of scratch registers, as well as the trapframe left behind after an * iret fault. */ CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - offsetof(struct pti_frame, pti_rip)); extern u_int64_t hammer_time(u_int64_t, u_int64_t); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = i8254_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, #ifdef SMP .mp_bootaddress = mp_bootaddress, .start_all_aps = native_start_all_aps, #endif #ifdef DEV_PCI .msi_init = msi_init, #endif }; /* * Physical address of the EFI System Table. Stashed from the metadata hints * passed into the kernel and used by the EFI code to call runtime services. */ vm_paddr_t efi_systbl_phys; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_gdt, r_idt; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); #ifdef DEV_PCI if (bootverbose && intel_graphics_stolen_base != 0) printf("intel stolen mem: base %#jx size %ju MB\n", (uintmax_t)intel_graphics_stolen_base, (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); #endif /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct pcb *pcb; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; pcb = td->td_pcb; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); update_pcb_bases(pcb); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, sizeof(sf.sf_uc.uc_mcontext.mc_spare)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct pcb *pcb; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; long rflags; int cs, error, ret; ksiginfo_t ksi; pcb = td->td_pcb; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { uprintf("pid %d (%s): sigreturn copyin failed\n", p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(rflags, regs->tf_rflags)) { uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, td->td_name, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); update_pcb_bases(pcb); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); update_pcb_bases(pcb); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); static char mce0_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); static char dbg0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); struct amd64tss common_tss[MAXCPU]; /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; void setidt(int idx, inthand_t *func, int typ, int dpl, int ist) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(div_pti), IDTVEC(bpt_pti), IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti), #endif IDTVEC(fast_syscall), IDTVEC(fast_syscall32), IDTVEC(fast_syscall_pti); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { struct { uint16_t limit; uint64_t base; } __packed idtr, gdtr; uint16_t ldt, tr; __asm __volatile("sidt %0" : "=m" (idtr)); db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit); __asm __volatile("sgdt %0" : "=m" (gdtr)); db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit); __asm __volatile("sldt %0" : "=r" (ldt)); db_printf("ldtr\t0x%04x\n", ldt); __asm __volatile("str %0" : "=r" (tr)); db_printf("tr\t0x%04x\n", tr); db_printf("cr0\t0x%016lx\n", rcr0()); db_printf("cr2\t0x%016lx\n", rcr2()); db_printf("cr3\t0x%016lx\n", rcr3()); db_printf("cr4\t0x%016lx\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016lx\n", rxcr(0)); db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%016lx\n", rdr0()); db_printf("dr1\t0x%016lx\n", rdr1()); db_printf("dr2\t0x%016lx\n", rdr2()); db_printf("dr3\t0x%016lx\n", rdr3()); db_printf("dr6\t0x%016lx\n", rdr6()); db_printf("dr7\t0x%016lx\n", rdr7()); } #endif void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include #include /* * Return a bitmap of the current interrupt requests. This is 8259-specific * and is only suitable for use at probe time. * This is only here to pacify sio. It is NOT FATAL if this doesn't work. * It shouldn't be here. There should probably be an APIC centric * implementation in the apic driver code, if at all. */ intrmask_t isa_irq_pending(void) { u_char irr1; u_char irr2; irr1 = inb(IO_ICU1); irr2 = inb(IO_ICU2); return ((irr2 << 8) | irr1); } #endif u_int basemem; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. * * NB: physmap_idx points to the next free slot. */ insert_idx = physmap_idx; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. */ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idx)) break; } } static char bootmethod[16] = ""; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; int page_counter; /* * Tell the physical memory allocator about pages used to store * the kernel and preloaded data. See kmem_bootstrap_free(). */ vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first)); bzero(physmap, sizeof(physmap)); physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); physmap_idx -= 2; /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] <= 0xA0000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0 || basemem > 640) { if (bootverbose) printf( "Memory map doesn't contain a basemem segment, faking it"); basemem = 640; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * Make hole for "AP -> long mode" bootstrap code. The * mp_bootaddress vector is only available when the kernel * is configured to support APs and APs for the system start * in real mode mode (e.g. SMP bare metal). */ if (init_ops.mp_bootaddress) init_ops.mp_bootaddress(physmap, &physmap_idx); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmap[0] < physmem_start) { if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); } pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ page_counter = 0; if (memtest != 0) printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * Print a "." every GB to show we're making * progress. */ page_counter++; if ((page_counter % PAGES_PER_GB) == 0) printf("."); /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); if (memtest != 0) printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; char *envp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += KERNBASE; init_static_kenv(envp, 0); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end); #endif efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); return (kmdp); } static void amd64_kdb_init(void) { kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } /* Set up the fast syscall stuff */ void amd64_conf_fast_syscall(void) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) : (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC); } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; struct nmi_pcpu *np; struct xstate_hdr *xhdr; u_int64_t rsp0; char *env; size_t kstack0_sz; int late_console; TSRAW(&thread0, TS_ENTER, __func__, NULL); kmdp = init_ops.parse_preload_data(modulep); physfree += ucode_load_bsp(physfree + KERNBASE); physfree = roundup2(physfree, PAGE_SIZE); identify_cpu1(); identify_hypervisor(); identify_cpu_fixup_bsp(); identify_cpu2(); initializecpucache(); /* * Check for pti, pcid, and invpcid before ifuncs are * resolved, to correctly select the implementation for * pmap_activate_sw_mode(). */ pti = pti_get_default(); TUNABLE_INT_FETCH("vm.pmap.pti", &pti); TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; } else { pmap_pcid_enabled = 0; } link_elf_ireloc(kmdp); /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); /* Init basic tunables, hz etc */ init_param1(); thread0.td_kstack = physfree + KERNBASE; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long) gdt; lgdt(&r_gdt); pc = &__pcpu[0]; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ pcpu_init(pc, 0, sizeof(struct pcpu)); dpcpu_init((void *)(physfree + KERNBASE), 0); physfree += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); /* Non-late cninit() and printf() can be moved up to here. */ PCPU_SET(tssp, &common_tss[0]); PCPU_SET(commontssp, &common_tss[0]); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). * Once bootblocks have updated, we can test directly for * efi_systbl != NULL here... */ if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL) vty_set_preferred(VTY_VT); TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush", &syscall_ret_l1d_flush_mode); + TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable); finishidentcpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ /* doublefault stack space, runs on ist1 */ common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t) pc; common_tss[0].tss_ist2 = (long) np; /* * MC# stack, runs on ist3. The pcpu pointer is stored just * above the start of the ist3 stack. */ np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1; np->np_pcpu = (register_t) pc; common_tss[0].tss_ist3 = (long) np; /* * DB# stack, runs on ist4. */ np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1; np->np_pcpu = (register_t) pc; common_tss[0].tss_ist4 = (long) np; /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); amd64_conf_fast_syscall(); /* * Temporary forge some valid pointer to PCB, for exception * handlers. It is reinitialized properly below after FPU is * set up. Also set up td_critnest to short-cut the page * fault handler. */ cpu_max_ext_state_size = sizeof(struct savefpu); thread0.td_pcb = get_pcb_td(&thread0); thread0.td_critnest = 1; /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); amd64_kdb_init(); } getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ #ifdef DEV_PCI /* This call might adjust phys_avail[]. */ pci_early_quirks(); #endif if (late_console) cninit(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?"; #endif if (late_console) amd64_kdb_init(); msgbufinit(msgbufp, msgbufsize); fpuinit(); /* * Set up thread0 pcb after fpuinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ rsp0 = (vm_offset_t)thread0.td_pcb; /* Ensure the stack is aligned to 16 bytes */ rsp0 &= ~0xFul; common_tss[0].tss_rsp0 = rsp0; PCPU_SET(rsp0, rsp0); PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); PCPU_SET(curpcb, thread0.td_pcb); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif thread0.td_critnest = 0; TSEXIT(); /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } int ptrace_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if ((td->td_frame->tf_rflags & PSL_T) == 0) { td->td_frame->tf_rflags |= PSL_T; td->td_dbgflags |= TDB_STEP; } return (0); } int ptrace_clear_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); td->td_frame->tf_rflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; if (tp->tf_flags & TF_HASSEGS) { regs->r_ds = tp->tf_ds; regs->r_es = tp->tf_es; regs->r_fs = tp->tf_fs; regs->r_gs = tp->tf_gs; } else { regs->r_ds = 0; regs->r_es = 0; regs->r_fs = 0; regs->r_gs = 0; } regs->r_err = 0; regs->r_trapno = 0; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; if (0) { /* XXXKIB */ tp->tf_ds = regs->r_ds; tp->tf_es = regs->r_es; tp->tf_fs = regs->r_fs; tp->tf_gs = regs->r_gs; tp->tf_flags = TF_HASSEGS; } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* XXX check all this stuff! */ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); fpugetregs(td); fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { critical_enter(); set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); fpuuserinited(td); critical_exit(); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_ds = tp->tf_ds; mcp->mc_es = tp->tf_es; mcp->mc_fs = tp->tf_fs; mcp->mc_gs = tp->tf_gs; mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); update_pcb_bases(pcb); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tp; char *xfpustate; long rflags; int ret; pcb = td->td_pcb; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; tp->tf_flags = mcp->mc_flags; if (tp->tf_flags & TF_HASSEGS) { tp->tf_ds = mcp->mc_ds; tp->tf_es = mcp->mc_es; tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } set_pcb_flags(pcb, PCB_FULL_IRET); if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(struct savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetuserregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetuserregs()... perhaps we just * have too many layers. */ clear_pcb_flags(curthread->td_pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; set_pcb_flags(pcb, PCB_DBREGS); } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(register_t dr6) { u_int64_t dr7; u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; bp = dr6 & DBREG_DR6_BMASK; if (bp == 0) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } /* * The pcb_flags is only modified by current thread, or by other threads * when current thread is stopped. However, current thread may change it * from the interrupt context in cpu_switch(), or in the trap handler. * When we read-modify-write pcb_flags from C sources, compiler may generate * code that is not atomic regarding the interrupt handler. If a trap or * interrupt happens and any flag is modified from the handler, it can be * clobbered with the cached value later. Therefore, we implement setting * and clearing flags with single-instruction functions, which do not race * with possible modification of the flags from the trap or interrupt context, * because traps and interrupts are executed only on instruction boundary. */ void set_pcb_flags_raw(struct pcb *pcb, const u_int flags) { __asm __volatile("orl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) : "cc", "memory"); } /* * The support for RDFSBASE, WRFSBASE and similar instructions for %gs * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into * pcb if user space modified the bases. We must save on the context * switch or if the return to usermode happens through the doreti. * * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, * which have a consequence that the base MSRs must be saved each time * the PCB_FULL_IRET flag is set. We disable interrupts to sync with * context switches. */ static void set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) { register_t r; if (curpcb == pcb && (flags & PCB_FULL_IRET) != 0 && (pcb->pcb_flags & PCB_FULL_IRET) == 0) { r = intr_disable(); if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { if (rfs() == _ufssel) pcb->pcb_fsbase = rdfsbase(); if (rgs() == _ugssel) pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); } set_pcb_flags_raw(pcb, flags); intr_restore(r); } else { set_pcb_flags_raw(pcb, flags); } } DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int), static) { return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? set_pcb_flags_fsgsbase : set_pcb_flags_raw); } void clear_pcb_flags(struct pcb *pcb, const u_int flags) { __asm __volatile("andl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) : "cc", "memory"); } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ #undef memset #undef memmove #undef memcpy void *memset_std(void *buf, int c, size_t len); void *memset_erms(void *buf, int c, size_t len); DEFINE_IFUNC(, void *, memset, (void *, int, size_t), static) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memset_erms : memset_std); } void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, size_t), static) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memmove_erms : memmove_std); } void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t), static) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memcpy_erms : memcpy_std); } void pagezero_std(void *addr); void pagezero_erms(void *addr); DEFINE_IFUNC(, void , pagezero, (void *), static) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? pagezero_erms : pagezero_std); } Index: projects/runtime-coverage-v2/sys/amd64/amd64/support.S =================================================================== --- projects/runtime-coverage-v2/sys/amd64/amd64/support.S (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/amd64/support.S (revision 347599) @@ -1,1624 +1,1865 @@ /*- + * Copyright (c) 2018-2019 The FreeBSD Foundation * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_ddb.h" #include #include #include #include "assym.inc" .text /* Address: %rdi */ ENTRY(pagezero_std) PUSH_FRAME_POINTER movl $PAGE_SIZE/8,%ecx xorl %eax,%eax rep stosq POP_FRAME_POINTER ret END(pagezero_std) ENTRY(pagezero_erms) PUSH_FRAME_POINTER movl $PAGE_SIZE,%ecx xorl %eax,%eax rep stosb POP_FRAME_POINTER ret END(pagezero_erms) /* * pagecopy(%rdi=from, %rsi=to) */ ENTRY(pagecopy) PUSH_FRAME_POINTER movl $PAGE_SIZE/8,%ecx movq %rdi,%r9 movq %rsi,%rdi movq %r9,%rsi rep movsq POP_FRAME_POINTER ret END(pagecopy) /* Address: %rdi */ ENTRY(sse2_pagezero) PUSH_FRAME_POINTER movq $-PAGE_SIZE,%rdx subq %rdx,%rdi xorl %eax,%eax jmp 1f /* * The loop takes 29 bytes. Ensure that it doesn't cross a 32-byte * cache line. */ .p2align 5,0x90 1: movnti %rax,(%rdi,%rdx) movnti %rax,8(%rdi,%rdx) movnti %rax,16(%rdi,%rdx) movnti %rax,24(%rdi,%rdx) addq $32,%rdx jne 1b sfence POP_FRAME_POINTER ret END(sse2_pagezero) /* * memcmpy(b1, b2, len) * rdi,rsi,len */ ENTRY(memcmp) PUSH_FRAME_POINTER cmpq $16,%rdx jae 5f 1: testq %rdx,%rdx je 3f xorl %ecx,%ecx 2: movzbl (%rdi,%rcx,1),%eax movzbl (%rsi,%rcx,1),%r8d cmpb %r8b,%al jne 4f addq $1,%rcx cmpq %rcx,%rdx jz 3f movzbl (%rdi,%rcx,1),%eax movzbl (%rsi,%rcx,1),%r8d cmpb %r8b,%al jne 4f addq $1,%rcx cmpq %rcx,%rdx jz 3f movzbl (%rdi,%rcx,1),%eax movzbl (%rsi,%rcx,1),%r8d cmpb %r8b,%al jne 4f addq $1,%rcx cmpq %rcx,%rdx jz 3f movzbl (%rdi,%rcx,1),%eax movzbl (%rsi,%rcx,1),%r8d cmpb %r8b,%al jne 4f addq $1,%rcx cmpq %rcx,%rdx jne 2b 3: xorl %eax,%eax POP_FRAME_POINTER ret 4: subl %r8d,%eax POP_FRAME_POINTER ret 5: cmpq $32,%rdx jae 7f 6: /* * 8 bytes */ movq (%rdi),%r8 movq (%rsi),%r9 cmpq %r8,%r9 jne 1b leaq 8(%rdi),%rdi leaq 8(%rsi),%rsi subq $8,%rdx cmpq $8,%rdx jae 6b jl 1b jmp 3b 7: /* * 32 bytes */ movq (%rsi),%r8 movq 8(%rsi),%r9 subq (%rdi),%r8 subq 8(%rdi),%r9 or %r8,%r9 jnz 1b movq 16(%rsi),%r8 movq 24(%rsi),%r9 subq 16(%rdi),%r8 subq 24(%rdi),%r9 or %r8,%r9 jnz 1b leaq 32(%rdi),%rdi leaq 32(%rsi),%rsi subq $32,%rdx cmpq $32,%rdx jae 7b jnz 1b jmp 3b END(memcmp) /* * memmove(dst, src, cnt) * rdi, rsi, rdx */ /* * Register state at entry is supposed to be as follows: * rdi - destination * rsi - source * rdx - count * * The macro possibly clobbers the above and: rcx, r8, r9, r10 * It does not clobber rax nor r11. */ .macro MEMMOVE erms overlap begin end \begin /* * For sizes 0..32 all data is read before it is written, so there * is no correctness issue with direction of copying. */ cmpq $32,%rcx jbe 101632f .if \overlap == 1 movq %rdi,%r8 subq %rsi,%r8 cmpq %rcx,%r8 /* overlapping && src < dst? */ jb 2f .endif cmpq $256,%rcx ja 1256f 103200: movq (%rsi),%rdx movq %rdx,(%rdi) movq 8(%rsi),%rdx movq %rdx,8(%rdi) movq 16(%rsi),%rdx movq %rdx,16(%rdi) movq 24(%rsi),%rdx movq %rdx,24(%rdi) leaq 32(%rsi),%rsi leaq 32(%rdi),%rdi subq $32,%rcx cmpq $32,%rcx jae 103200b cmpb $0,%cl jne 101632f \end ret ALIGN_TEXT 101632: cmpb $16,%cl jl 100816f movq (%rsi),%rdx movq 8(%rsi),%r8 movq -16(%rsi,%rcx),%r9 movq -8(%rsi,%rcx),%r10 movq %rdx,(%rdi) movq %r8,8(%rdi) movq %r9,-16(%rdi,%rcx) movq %r10,-8(%rdi,%rcx) \end ret ALIGN_TEXT 100816: cmpb $8,%cl jl 100408f movq (%rsi),%rdx movq -8(%rsi,%rcx),%r8 movq %rdx,(%rdi) movq %r8,-8(%rdi,%rcx,) \end ret ALIGN_TEXT 100408: cmpb $4,%cl jl 100204f movl (%rsi),%edx movl -4(%rsi,%rcx),%r8d movl %edx,(%rdi) movl %r8d,-4(%rdi,%rcx) \end ret ALIGN_TEXT 100204: cmpb $2,%cl jl 100001f movzwl (%rsi),%edx movzwl -2(%rsi,%rcx),%r8d movw %dx,(%rdi) movw %r8w,-2(%rdi,%rcx) \end ret ALIGN_TEXT 100001: cmpb $1,%cl jl 100000f movb (%rsi),%dl movb %dl,(%rdi) 100000: \end ret ALIGN_TEXT 1256: testb $15,%dil jnz 100f .if \erms == 1 rep movsb .else shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andl $7,%ecx /* any bytes left? */ jne 100408b .endif \end ret 100: movq (%rsi),%r8 movq 8(%rsi),%r9 movq %rdi,%r10 movq %rdi,%rcx andq $15,%rcx leaq -16(%rdx,%rcx),%rdx neg %rcx leaq 16(%rdi,%rcx),%rdi leaq 16(%rsi,%rcx),%rsi movq %rdx,%rcx .if \erms == 1 rep movsb movq %r8,(%r10) movq %r9,8(%r10) .else shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %r8,(%r10) movq %r9,8(%r10) movq %rdx,%rcx andl $7,%ecx /* any bytes left? */ jne 100408b .endif \end ret .if \overlap == 1 /* * Copy backwards. */ ALIGN_TEXT 2: cmpq $256,%rcx ja 2256f leaq -8(%rdi,%rcx),%rdi leaq -8(%rsi,%rcx),%rsi cmpq $32,%rcx jb 2016f 2032: movq (%rsi),%rdx movq %rdx,(%rdi) movq -8(%rsi),%rdx movq %rdx,-8(%rdi) movq -16(%rsi),%rdx movq %rdx,-16(%rdi) movq -24(%rsi),%rdx movq %rdx,-24(%rdi) leaq -32(%rsi),%rsi leaq -32(%rdi),%rdi subq $32,%rcx cmpq $32,%rcx jae 2032b cmpb $0,%cl jne 2016f \end ret ALIGN_TEXT 2016: cmpb $16,%cl jl 2008f movq (%rsi),%rdx movq %rdx,(%rdi) movq -8(%rsi),%rdx movq %rdx,-8(%rdi) subb $16,%cl jz 2000f leaq -16(%rsi),%rsi leaq -16(%rdi),%rdi 2008: cmpb $8,%cl jl 2004f movq (%rsi),%rdx movq %rdx,(%rdi) subb $8,%cl jz 2000f leaq -8(%rsi),%rsi leaq -8(%rdi),%rdi 2004: cmpb $4,%cl jl 2002f movl 4(%rsi),%edx movl %edx,4(%rdi) subb $4,%cl jz 2000f leaq -4(%rsi),%rsi leaq -4(%rdi),%rdi 2002: cmpb $2,%cl jl 2001f movw 6(%rsi),%dx movw %dx,6(%rdi) subb $2,%cl jz 2000f leaq -2(%rsi),%rsi leaq -2(%rdi),%rdi 2001: cmpb $1,%cl jl 2000f movb 7(%rsi),%dl movb %dl,7(%rdi) 2000: \end ret ALIGN_TEXT 2256: std .if \erms == 1 leaq -1(%rdi,%rcx),%rdi leaq -1(%rsi,%rcx),%rsi rep movsb cld .else leaq -8(%rdi,%rcx),%rdi leaq -8(%rsi,%rcx),%rsi shrq $3,%rcx rep movsq cld movq %rdx,%rcx andb $7,%cl jne 2004b .endif \end ret .endif .endm .macro MEMMOVE_BEGIN PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx .endm .macro MEMMOVE_END POP_FRAME_POINTER .endm ENTRY(memmove_std) MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END END(memmove_std) ENTRY(memmove_erms) MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END END(memmove_erms) /* * memcpy(dst, src, len) * rdi, rsi, rdx * * Note: memcpy does not support overlapping copies */ ENTRY(memcpy_std) MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END END(memcpy_std) ENTRY(memcpy_erms) MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END END(memcpy_erms) /* * memset(dst, c, len) * rdi, rsi, rdx */ .macro MEMSET erms PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx movzbq %sil,%r8 movabs $0x0101010101010101,%r10 imulq %r8,%r10 cmpq $32,%rcx jbe 101632f cmpq $256,%rcx ja 1256f 103200: movq %r10,(%rdi) movq %r10,8(%rdi) movq %r10,16(%rdi) movq %r10,24(%rdi) leaq 32(%rdi),%rdi subq $32,%rcx cmpq $32,%rcx ja 103200b cmpb $16,%cl ja 201632f movq %r10,-16(%rdi,%rcx) movq %r10,-8(%rdi,%rcx) POP_FRAME_POINTER ret ALIGN_TEXT 101632: cmpb $16,%cl jl 100816f 201632: movq %r10,(%rdi) movq %r10,8(%rdi) movq %r10,-16(%rdi,%rcx) movq %r10,-8(%rdi,%rcx) POP_FRAME_POINTER ret ALIGN_TEXT 100816: cmpb $8,%cl jl 100408f movq %r10,(%rdi) movq %r10,-8(%rdi,%rcx) POP_FRAME_POINTER ret ALIGN_TEXT 100408: cmpb $4,%cl jl 100204f movl %r10d,(%rdi) movl %r10d,-4(%rdi,%rcx) POP_FRAME_POINTER ret ALIGN_TEXT 100204: cmpb $2,%cl jl 100001f movw %r10w,(%rdi) movw %r10w,-2(%rdi,%rcx) POP_FRAME_POINTER ret ALIGN_TEXT 100001: cmpb $0,%cl je 100000f movb %r10b,(%rdi) 100000: POP_FRAME_POINTER ret ALIGN_TEXT 1256: movq %rdi,%r9 movq %r10,%rax testl $15,%edi jnz 3f 1: .if \erms == 1 rep stosb movq %r9,%rax .else movq %rcx,%rdx shrq $3,%rcx rep stosq movq %r9,%rax andl $7,%edx jnz 2f POP_FRAME_POINTER ret 2: movq %r10,-8(%rdi,%rdx) .endif POP_FRAME_POINTER ret ALIGN_TEXT 3: movq %r10,(%rdi) movq %r10,8(%rdi) movq %rdi,%r8 andq $15,%r8 leaq -16(%rcx,%r8),%rcx neg %r8 leaq 16(%rdi,%r8),%rdi jmp 1b .endm ENTRY(memset_std) MEMSET erms=0 END(memset_std) ENTRY(memset_erms) MEMSET erms=1 END(memset_erms) /* fillw(pat, base, cnt) */ /* %rdi,%rsi, %rdx */ ENTRY(fillw) PUSH_FRAME_POINTER movq %rdi,%rax movq %rsi,%rdi movq %rdx,%rcx rep stosw POP_FRAME_POINTER ret END(fillw) /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines should be * the only places that do this. * * These routines set curpcb->pcb_onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->pcb_onfault instead of the function. */ .macro SMAP_DISABLE smap .if \smap stac .endif .endm .macro SMAP_ENABLE smap .if \smap clac .endif .endm .macro COPYINOUT_BEGIN .endm .macro COPYINOUT_END movq %rax,PCB_ONFAULT(%r11) POP_FRAME_POINTER .endm .macro COPYINOUT_SMAP_END SMAP_ENABLE smap=1 COPYINOUT_END .endm /* * copyout(from_kernel, to_user, len) * %rdi, %rsi, %rdx */ .macro COPYOUT smap erms PUSH_FRAME_POINTER movq PCPU(CURPCB),%r11 movq $copy_fault,PCB_ONFAULT(%r11) /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movq %rsi,%rax addq %rdx,%rax jc copy_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copy_fault /* * Set return value to zero. Remaining failure mode goes through * copy_fault. */ xorl %eax,%eax /* * Set up arguments for MEMMOVE. */ movq %rdi,%r8 movq %rsi,%rdi movq %r8,%rsi movq %rdx,%rcx SMAP_DISABLE \smap .if \smap == 1 MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END .else MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END .endif /* NOTREACHED */ .endm ENTRY(copyout_nosmap_std) COPYOUT smap=0 erms=0 END(copyout_nosmap_std) ENTRY(copyout_smap_std) COPYOUT smap=1 erms=0 END(copyout_smap_std) ENTRY(copyout_nosmap_erms) COPYOUT smap=0 erms=1 END(copyout_nosmap_erms) ENTRY(copyout_smap_erms) COPYOUT smap=1 erms=1 END(copyout_smap_erms) /* * copyin(from_user, to_kernel, len) * %rdi, %rsi, %rdx */ .macro COPYIN smap erms PUSH_FRAME_POINTER movq PCPU(CURPCB),%r11 movq $copy_fault,PCB_ONFAULT(%r11) /* * make sure address is valid */ movq %rdi,%rax addq %rdx,%rax jc copy_fault movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copy_fault xorl %eax,%eax movq %rdi,%r8 movq %rsi,%rdi movq %r8,%rsi movq %rdx,%rcx SMAP_DISABLE \smap .if \smap == 1 MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END .else MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END .endif /* NOTREACHED */ .endm ENTRY(copyin_nosmap_std) COPYIN smap=0 erms=0 END(copyin_nosmap_std) ENTRY(copyin_smap_std) COPYIN smap=1 erms=0 END(copyin_smap_std) ENTRY(copyin_nosmap_erms) COPYIN smap=0 erms=1 END(copyin_nosmap_erms) ENTRY(copyin_smap_erms) COPYIN smap=1 erms=1 END(copyin_smap_erms) ALIGN_TEXT /* Trap entry clears PSL.AC */ copy_fault: movq $0,PCB_ONFAULT(%r11) movl $EFAULT,%eax POP_FRAME_POINTER ret /* * casueword32. Compare and set user integer. Returns -1 on fault, * 0 if access was successful. Old value is written to *oldp. * dst = %rdi, old = %esi, oldp = %rdx, new = %ecx */ ENTRY(casueword32_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movl %esi,%eax /* old */ #ifdef SMP lock #endif cmpxchgl %ecx,(%rdi) /* new = %ecx */ /* * The old value is in %eax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. Save %eax into %esi to prepare the return * value. */ movl %eax,%esi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) /* * Access the oldp after the pcb_onfault is cleared, to correctly * catch corrupted pointer. */ movl %esi,(%rdx) /* oldp = %rdx */ POP_FRAME_POINTER ret END(casueword32_nosmap) ENTRY(casueword32_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movl %esi,%eax /* old */ stac #ifdef SMP lock #endif cmpxchgl %ecx,(%rdi) /* new = %ecx */ clac /* * The old value is in %eax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. Save %eax into %esi to prepare the return * value. */ movl %eax,%esi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) /* * Access the oldp after the pcb_onfault is cleared, to correctly * catch corrupted pointer. */ movl %esi,(%rdx) /* oldp = %rdx */ POP_FRAME_POINTER ret END(casueword32_smap) /* * casueword. Compare and set user long. Returns -1 on fault, * 0 if access was successful. Old value is written to *oldp. * dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx */ ENTRY(casueword_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movq %rsi,%rax /* old */ #ifdef SMP lock #endif cmpxchgq %rcx,(%rdi) /* new = %rcx */ /* * The old value is in %rax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. */ movq %rax,%rsi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) movq %rsi,(%rdx) POP_FRAME_POINTER ret END(casueword_nosmap) ENTRY(casueword_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movq %rsi,%rax /* old */ stac #ifdef SMP lock #endif cmpxchgq %rcx,(%rdi) /* new = %rcx */ clac /* * The old value is in %rax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. */ movq %rax,%rsi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) movq %rsi,(%rdx) POP_FRAME_POINTER ret END(casueword_smap) /* * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit * byte from user memory. * addr = %rdi, valp = %rsi */ ENTRY(fueword_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax movq (%rdi),%r11 movq %rax,PCB_ONFAULT(%rcx) movq %r11,(%rsi) POP_FRAME_POINTER ret END(fueword_nosmap) ENTRY(fueword_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax stac movq (%rdi),%r11 clac movq %rax,PCB_ONFAULT(%rcx) movq %r11,(%rsi) POP_FRAME_POINTER ret END(fueword_smap) ENTRY(fueword32_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax movl (%rdi),%r11d movq %rax,PCB_ONFAULT(%rcx) movl %r11d,(%rsi) POP_FRAME_POINTER ret END(fueword32_nosmap) ENTRY(fueword32_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax stac movl (%rdi),%r11d clac movq %rax,PCB_ONFAULT(%rcx) movl %r11d,(%rsi) POP_FRAME_POINTER ret END(fueword32_smap) ENTRY(fuword16_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi ja fusufault movzwl (%rdi),%eax movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fuword16_nosmap) ENTRY(fuword16_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi ja fusufault stac movzwl (%rdi),%eax clac movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fuword16_smap) ENTRY(fubyte_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi ja fusufault movzbl (%rdi),%eax movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fubyte_nosmap) ENTRY(fubyte_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi ja fusufault stac movzbl (%rdi),%eax clac movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fubyte_smap) /* * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to * user memory. * addr = %rdi, value = %rsi */ ENTRY(suword_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movq %rsi,(%rdi) xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword_nosmap) ENTRY(suword_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault stac movq %rsi,(%rdi) clac xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword_smap) ENTRY(suword32_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,(%rdi) xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword32_nosmap) ENTRY(suword32_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault stac movl %esi,(%rdi) clac xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword32_smap) ENTRY(suword16_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movw %si,(%rdi) xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword16_nosmap) ENTRY(suword16_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault stac movw %si,(%rdi) clac xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword16_smap) ENTRY(subyte_nosmap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,%eax movb %al,(%rdi) xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(subyte_nosmap) ENTRY(subyte_smap) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,%eax stac movb %al,(%rdi) clac xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(subyte_smap) ALIGN_TEXT /* Fault entry clears PSL.AC */ fusufault: movq PCPU(CURPCB),%rcx xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) decq %rax POP_FRAME_POINTER ret /* * copyinstr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx * * copy a string from 'from' to 'to', stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ .macro COPYINSTR smap PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ movq PCPU(CURPCB),%r9 movq $cpystrflt,PCB_ONFAULT(%r9) movq $VM_MAXUSER_ADDRESS,%rax /* make sure 'from' is within bounds */ subq %rdi,%rax jbe cpystrflt SMAP_DISABLE \smap /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpq %rdx,%rax jb 8f 1: incq %rdx 2: decq %rdx .if \smap == 0 jz copyinstr_toolong .else jz copyinstr_toolong_smap .endif movb (%rdi),%al movb %al,(%rsi) incq %rsi incq %rdi testb %al,%al jnz 2b SMAP_ENABLE \smap /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax /* set *lencopied and return %eax */ movq %rax,PCB_ONFAULT(%r9) testq %rcx,%rcx jz 3f subq %rdx,%r8 movq %r8,(%rcx) 3: POP_FRAME_POINTER ret ALIGN_TEXT 8: movq %rax,%rdx movq %rax,%r8 jmp 1b .endm ENTRY(copyinstr_nosmap) COPYINSTR smap=0 END(copyinstr_nosmap) ENTRY(copyinstr_smap) COPYINSTR smap=1 END(copyinstr_smap) cpystrflt: /* Fault entry clears PSL.AC */ movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movq $0,PCB_ONFAULT(%r9) testq %rcx,%rcx jz 1f subq %rdx,%r8 movq %r8,(%rcx) 1: POP_FRAME_POINTER ret copyinstr_toolong_smap: clac copyinstr_toolong: /* rdx is zero - return ENAMETOOLONG or EFAULT */ movq $VM_MAXUSER_ADDRESS,%rax cmpq %rax,%rdi jae cpystrflt movl $ENAMETOOLONG,%eax jmp cpystrflt_x /* * copystr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx */ ENTRY(copystr) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ incq %rdx 1: decq %rdx jz 4f movb (%rdi),%al movb %al,(%rsi) incq %rsi incq %rdi testb %al,%al jnz 1b /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax 2: testq %rcx,%rcx jz 3f /* set *lencopied and return %rax */ subq %rdx,%r8 movq %r8,(%rcx) 3: POP_FRAME_POINTER ret 4: /* rdx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax jmp 2b END(copystr) /* * Handling of special amd64 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ lgdt (%rdi) /* flush the prefetch q */ jmp 1f nop 1: movl $KDSEL,%eax movl %eax,%ds movl %eax,%es movl %eax,%fs /* Beware, use wrmsr to set 64 bit base */ movl %eax,%gs movl %eax,%ss /* reload code selector by turning return into intersegmental return */ popq %rax pushq $KCSEL pushq %rax MEXITCOUNT lretq END(lgdt) /*****************************************************************************/ /* setjump, longjump */ /*****************************************************************************/ ENTRY(setjmp) movq %rbx,0(%rdi) /* save rbx */ movq %rsp,8(%rdi) /* save rsp */ movq %rbp,16(%rdi) /* save rbp */ movq %r12,24(%rdi) /* save r12 */ movq %r13,32(%rdi) /* save r13 */ movq %r14,40(%rdi) /* save r14 */ movq %r15,48(%rdi) /* save r15 */ movq 0(%rsp),%rdx /* get rta */ movq %rdx,56(%rdi) /* save rip */ xorl %eax,%eax /* return(0); */ ret END(setjmp) ENTRY(longjmp) movq 0(%rdi),%rbx /* restore rbx */ movq 8(%rdi),%rsp /* restore rsp */ movq 16(%rdi),%rbp /* restore rbp */ movq 24(%rdi),%r12 /* restore r12 */ movq 32(%rdi),%r13 /* restore r13 */ movq 40(%rdi),%r14 /* restore r14 */ movq 48(%rdi),%r15 /* restore r15 */ movq 56(%rdi),%rdx /* get rta */ movq %rdx,0(%rsp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret END(longjmp) /* * Support for reading MSRs in the safe manner. (Instead of panic on #gp, * return an error.) */ ENTRY(rdmsr_safe) /* int rdmsr_safe(u_int msr, uint64_t *data) */ PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $msr_onfault,PCB_ONFAULT(%r8) movl %edi,%ecx rdmsr /* Read MSR pointed by %ecx. Returns hi byte in edx, lo in %eax */ salq $32,%rdx /* sign-shift %rdx left */ movl %eax,%eax /* zero-extend %eax -> %rax */ orq %rdx,%rax movq %rax,(%rsi) xorq %rax,%rax movq %rax,PCB_ONFAULT(%r8) POP_FRAME_POINTER ret /* * Support for writing MSRs in the safe manner. (Instead of panic on #gp, * return an error.) */ ENTRY(wrmsr_safe) /* int wrmsr_safe(u_int msr, uint64_t data) */ PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $msr_onfault,PCB_ONFAULT(%r8) movl %edi,%ecx movl %esi,%eax sarq $32,%rsi movl %esi,%edx wrmsr /* Write MSR pointed by %ecx. Accepts hi byte in edx, lo in %eax. */ xorq %rax,%rax movq %rax,PCB_ONFAULT(%r8) POP_FRAME_POINTER ret /* * MSR operations fault handler */ ALIGN_TEXT msr_onfault: movq $0,PCB_ONFAULT(%r8) movl $EFAULT,%eax POP_FRAME_POINTER ret /* * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3); * Invalidates address space addressed by ucr3, then returns to kcr3. * Done in assembler to ensure no other memory accesses happen while * on ucr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invalidate) pushfq cli movq %rdi,%cr3 /* to user page table */ movq %rsi,%cr3 /* back to kernel */ popfq retq /* * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va); * Invalidates virtual address va in address space ucr3, then returns to kcr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invlpg) pushfq cli movq %rdi,%cr3 /* to user page table */ invlpg (%rdx) movq %rsi,%cr3 /* back to kernel */ popfq retq /* * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva, * vm_offset_t eva); * Invalidates virtual addresses between sva and eva in address space ucr3, * then returns to kcr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invlrng) pushfq cli movq %rdi,%cr3 /* to user page table */ 1: invlpg (%rdx) addq $PAGE_SIZE,%rdx cmpq %rdx,%rcx ja 1b movq %rsi,%cr3 /* back to kernel */ popfq retq .altmacro .macro ibrs_seq_label l handle_ibrs_\l: .endm .macro ibrs_call_label l call handle_ibrs_\l .endm .macro ibrs_seq count ll=1 .rept \count ibrs_call_label %(ll) nop ibrs_seq_label %(ll) addq $8,%rsp ll=ll+1 .endr .endm /* all callers already saved %rax, %rdx, and %rcx */ ENTRY(handle_ibrs_entry) cmpb $0,hw_ibrs_active(%rip) je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx wrmsr movb $1,PCPU(IBPB_SET) testl $CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip) jne 1f ibrs_seq 32 1: ret END(handle_ibrs_entry) ENTRY(handle_ibrs_exit) cmpb $0,PCPU(IBPB_SET) je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr andl $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax andl $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx wrmsr movb $0,PCPU(IBPB_SET) 1: ret END(handle_ibrs_exit) /* registers-neutral version, but needs stack */ ENTRY(handle_ibrs_exit_rs) cmpb $0,PCPU(IBPB_SET) je 1f pushq %rax pushq %rdx pushq %rcx movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr andl $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax andl $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx wrmsr popq %rcx popq %rdx popq %rax movb $0,PCPU(IBPB_SET) 1: ret END(handle_ibrs_exit_rs) .noaltmacro /* * Flush L1D cache. Load enough of the data from the kernel text * to flush existing L1D content. * * N.B. The function does not follow ABI calling conventions, it corrupts %rbx. * The vmm.ko caller expects that only %rax, %rdx, %rbx, %rcx, %r9, and %rflags * registers are clobbered. The NMI handler caller only needs %r13 preserved. */ ENTRY(flush_l1d_sw) #define L1D_FLUSH_SIZE (64 * 1024) movq $KERNBASE, %r9 movq $-L1D_FLUSH_SIZE, %rcx /* * pass 1: Preload TLB. * Kernel text is mapped using superpages. TLB preload is * done for the benefit of older CPUs which split 2M page * into 4k TLB entries. */ 1: movb L1D_FLUSH_SIZE(%r9, %rcx), %al addq $PAGE_SIZE, %rcx jne 1b xorl %eax, %eax cpuid movq $-L1D_FLUSH_SIZE, %rcx /* pass 2: Read each cache line. */ 2: movb L1D_FLUSH_SIZE(%r9, %rcx), %al addq $64, %rcx jne 2b lfence ret #undef L1D_FLUSH_SIZE END(flush_l1d_sw) ENTRY(flush_l1d_sw_abi) pushq %rbx call flush_l1d_sw popq %rbx ret END(flush_l1d_sw_abi) + +ENTRY(mds_handler_void) + retq +END(mds_handler_void) + +ENTRY(mds_handler_verw) + subq $8, %rsp + movw %ds, (%rsp) + verw (%rsp) + addq $8, %rsp + retq +END(mds_handler_verw) + +ENTRY(mds_handler_ivb) + pushq %rax + pushq %rdx + pushq %rcx + + movq %cr0, %rax + testb $CR0_TS, %al + je 1f + clts +1: movq PCPU(MDS_BUF), %rdx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + lfence + orpd (%rdx), %xmm0 + orpd (%rdx), %xmm0 + mfence + movl $40, %ecx + addq $16, %rdx +2: movntdq %xmm0, (%rdx) + addq $16, %rdx + decl %ecx + jnz 2b + mfence + + movdqa PCPU(MDS_TMP),%xmm0 + testb $CR0_TS, %al + je 3f + movq %rax, %cr0 +3: popq %rcx + popq %rdx + popq %rax + retq +END(mds_handler_ivb) + +ENTRY(mds_handler_bdw) + pushq %rax + pushq %rbx + pushq %rcx + pushq %rdi + pushq %rsi + + movq %cr0, %rax + testb $CR0_TS, %al + je 1f + clts +1: movq PCPU(MDS_BUF), %rbx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + movq %rbx, %rdi + movq %rbx, %rsi + movl $40, %ecx +2: movntdq %xmm0, (%rbx) + addq $16, %rbx + decl %ecx + jnz 2b + mfence + movl $1536, %ecx + rep; movsb + lfence + + movdqa PCPU(MDS_TMP),%xmm0 + testb $CR0_TS, %al + je 3f + movq %rax, %cr0 +3: popq %rsi + popq %rdi + popq %rcx + popq %rbx + popq %rax + retq +END(mds_handler_bdw) + +ENTRY(mds_handler_skl_sse) + pushq %rax + pushq %rdx + pushq %rcx + pushq %rdi + + movq %cr0, %rax + testb $CR0_TS, %al + je 1f + clts +1: movq PCPU(MDS_BUF), %rdi + movq PCPU(MDS_BUF64), %rdx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + lfence + orpd (%rdx), %xmm0 + orpd (%rdx), %xmm0 + xorl %eax, %eax +2: clflushopt 5376(%rdi, %rax, 8) + addl $8, %eax + cmpl $8 * 12, %eax + jb 2b + sfence + movl $6144, %ecx + xorl %eax, %eax + rep; stosb + mfence + + movdqa PCPU(MDS_TMP), %xmm0 + testb $CR0_TS, %al + je 3f + movq %rax, %cr0 +3: popq %rdi + popq %rcx + popq %rdx + popq %rax + retq +END(mds_handler_skl_sse) + +ENTRY(mds_handler_skl_avx) + pushq %rax + pushq %rdx + pushq %rcx + pushq %rdi + + movq %cr0, %rax + testb $CR0_TS, %al + je 1f + clts +1: movq PCPU(MDS_BUF), %rdi + movq PCPU(MDS_BUF64), %rdx + vmovdqa %ymm0, PCPU(MDS_TMP) + vpxor %ymm0, %ymm0, %ymm0 + + lfence + vorpd (%rdx), %ymm0, %ymm0 + vorpd (%rdx), %ymm0, %ymm0 + xorl %eax, %eax +2: clflushopt 5376(%rdi, %rax, 8) + addl $8, %eax + cmpl $8 * 12, %eax + jb 2b + sfence + movl $6144, %ecx + xorl %eax, %eax + rep; stosb + mfence + + vmovdqa PCPU(MDS_TMP), %ymm0 + testb $CR0_TS, %al + je 3f + movq %rax, %cr0 +3: popq %rdi + popq %rcx + popq %rdx + popq %rax + retq +END(mds_handler_skl_avx) + +ENTRY(mds_handler_skl_avx512) + pushq %rax + pushq %rdx + pushq %rcx + pushq %rdi + + movq %cr0, %rax + testb $CR0_TS, %al + je 1f + clts +1: movq PCPU(MDS_BUF), %rdi + movq PCPU(MDS_BUF64), %rdx + vmovdqa64 %zmm0, PCPU(MDS_TMP) + vpxor %zmm0, %zmm0, %zmm0 + + lfence + vorpd (%rdx), %zmm0, %zmm0 + vorpd (%rdx), %zmm0, %zmm0 + xorl %eax, %eax +2: clflushopt 5376(%rdi, %rax, 8) + addl $8, %eax + cmpl $8 * 12, %eax + jb 2b + sfence + movl $6144, %ecx + xorl %eax, %eax + rep; stosb + mfence + + vmovdqa64 PCPU(MDS_TMP), %zmm0 + testb $CR0_TS, %al + je 3f + movq %rax, %cr0 +3: popq %rdi + popq %rcx + popq %rdx + popq %rax + retq +END(mds_handler_skl_avx512) + +ENTRY(mds_handler_silvermont) + pushq %rax + pushq %rdx + pushq %rcx + + movq %cr0, %rax + testb $CR0_TS, %al + je 1f + clts +1: movq PCPU(MDS_BUF), %rdx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + movl $16, %ecx +2: movntdq %xmm0, (%rdx) + addq $16, %rdx + decl %ecx + jnz 2b + mfence + + movdqa PCPU(MDS_TMP),%xmm0 + testb $CR0_TS, %al + je 3f + movq %rax, %cr0 +3: popq %rcx + popq %rdx + popq %rax + retq +END(mds_handler_silvermont) Index: projects/runtime-coverage-v2/sys/amd64/amd64/trap.c =================================================================== --- projects/runtime-coverage-v2/sys/amd64/amd64/trap.c (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/amd64/trap.c (revision 347599) @@ -1,1205 +1,1205 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include #endif extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg), IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32), IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall); void __noinline trap(struct trapframe *frame); void trap_check(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "debug exception", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Control L1D flush on return from NMI. * * Tunable can be set to the following values: * 0 - only enable flush on return from NMI if required by vmm.ko (default) * >1 - always flush on return from NMI. * * Post-boot, the sysctl indicates if flushing is currently enabled. */ int nmi_flush_l1d_sw; SYSCTL_INT(_machdep, OID_AUTO, nmi_flush_l1d_sw, CTLFLAG_RWTUN, &nmi_flush_l1d_sw, 0, "Flush L1 Data Cache on NMI exit, software bhyve L1TF mitigation assist"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { ksiginfo_t ksi; struct thread *td; struct proc *p; register_t addr, dr6; int signo, ucode; u_int type; td = curthread; p = td->td_proc; signo = 0; ucode = 0; addr = 0; dr6 = 0; VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI && ipi_nmi_handler() == 0) return; #endif #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); return; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A non-zero return value from the handler means that * the NMI was consumed by it and we can return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(frame) != 0) return; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) return; #endif } if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame)) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. */ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } if (TRAPF_USERMODE(frame)) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ signo = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ enable_intr(); #ifdef KDTRACE_HOOKS if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(frame) == 0) return; #endif signo = SIGTRAP; ucode = TRAP_BRKPT; break; case T_TRCTRAP: /* debug exception */ enable_intr(); signo = SIGTRAP; ucode = TRAP_TRACE; dr6 = rdr6(); if ((dr6 & DBREG_DR6_BS) != 0) { PROC_LOCK(td->td_proc); if ((td->td_dbgflags & TDB_STEP) != 0) { td->td_frame->tf_rflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; } PROC_UNLOCK(td->td_proc); } break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) return; signo = SIGFPE; break; case T_PROTFLT: /* general protection fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ signo = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: signo = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: signo = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ /* * Emulator can take care about this trap? */ if (*p->p_sysent->sv_trap != NULL && (*p->p_sysent->sv_trap)(td) == 0) return; addr = frame->tf_addr; signo = trap_pfault(frame, TRUE); if (signo == -1) return; if (signo == 0) goto userret; if (signo == SIGSEGV) { ucode = SEGV_MAPERR; } else if (prot_fault_translation == 0) { /* * Autodetect. This check also covers * the images without the ABI-tag ELF * note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { signo = SIGSEGV; ucode = SEGV_ACCERR; } else { signo = SIGBUS; ucode = T_PAGEFLT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ signo = SIGBUS; ucode = T_PAGEFLT; } else { /* * Always SIGSEGV mode. */ signo = SIGSEGV; ucode = SEGV_ACCERR; } break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; signo = SIGFPE; break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); return; #endif case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; signo = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; signo = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); return; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; signo = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) return; signo = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); if (dtrace_return_probe_ptr != NULL) dtrace_return_probe_ptr(frame); return; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE); return; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); fpudna(); return; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * For now, supporting kernel handler * registration for FPU traps is overkill. */ trap_fatal(frame, 0); return; case T_STKFLT: /* stack fault */ case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. * * In case of PTI, the IRETQ faulted while the * kernel used the pti stack, and exception * frame records %rsp value pointing to that * stack. If we return normally to * doreti_iret_fault, the trapframe is * reconstructed on pti stack, and calltrap() * called on it as well. Due to the very * limited pti stack size, kernel does not * survive for too long. Switch to the normal * thread stack for the trap handling. * * Magic '5' is the number of qwords occupied by * the hardware trap frame. */ if (frame->tf_rip == (long)doreti_iret) { frame->tf_rip = (long)doreti_iret_fault; if ((PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3) && (frame->tf_rsp == (uintptr_t)PCPU_GET( pti_rsp0) - 5 * sizeof(register_t))) { frame->tf_rsp = PCPU_GET(rsp0) - 5 * sizeof(register_t); } return; } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; return; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; return; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; return; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; return; } if (frame->tf_rip == (long)ld_gsbase) { frame->tf_rip = (long)gsbase_load_fault; return; } if (frame->tf_rip == (long)ld_fsbase) { frame->tf_rip = (long)fsbase_load_fault; return; } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* debug exception */ /* Clear any pending debug events. */ dr6 = rdr6(); load_dr6(0); /* * Ignore debug register exceptions due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap(dr6)) return; /* * Malicious user code can configure a debug * register watchpoint to trap on data access * to the top of stack and then execute 'pop * %ss; int 3'. Due to exception deferral for * 'pop %ss', the CPU will not interrupt 'int * 3' to raise the DB# exception for the debug * register but will postpone the DB# until * execution of the first instruction of the * BP# handler (in kernel mode). Normally the * previous check would ignore DB# exceptions * for watchpoints on user addresses raised in * kernel mode. However, some CPU errata * include cases where DB# exceptions do not * properly set bits in %dr6, e.g. Haswell * HSD23 and Skylake-X SKZ24. * * A deferred DB# can also be raised on the * first instructions of system call entry * points or single-step traps via similar use * of 'pop %ss' or 'mov xxx, %ss'. */ if (pti) { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall_pti) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall_pti) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti)) return; } else { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt)) return; } if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) || /* Needed for AMD. */ frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32)) return; /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB if (kdb_trap(type, dr6, frame)) return; #endif break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); return; #endif } trap_fatal(frame, 0); return; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap != NULL) signo = (*p->p_sysent->sv_transtrap)(signo, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %lx code %d type %d " "addr 0x%lx rsp 0x%lx rip 0x%lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); userret: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); } /* * Ensure that we ignore any DTrace-induced faults. This function cannot * be instrumented, so it cannot generate such faults itself. */ void trap_check(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, frame->tf_trapno) != 0) return; #endif trap(frame); } static bool trap_is_smap(struct trapframe *frame) { /* * A page fault on a userspace address is classified as * SMAP-induced if: * - SMAP is supported; * - kernel mode accessed present data page; * - rflags.AC was cleared. * Kernel must never access user space with rflags.AC cleared * if SMAP is enabled. */ return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 && (frame->tf_err & (PGEX_P | PGEX_U | PGEX_I | PGEX_RSV)) == PGEX_P && (frame->tf_rflags & PSL_AC) == 0); } static bool trap_is_pti(struct trapframe *frame) { return (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) == (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)); } static int trap_pfault(struct trapframe *frame, int usermode) { struct thread *td; struct proc *p; vm_map_t map; vm_offset_t va; int rv; vm_prot_t ftype; vm_offset_t eva; td = curthread; p = td->td_proc; eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) return (SIGSEGV); map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. * * If SMAP is enabled, filter SMAP faults also, * because illegal access might occur to the mapped * user address, causing infinite loop. */ if (!usermode && (td->td_intr_nesting_level != 0 || trap_is_smap(frame) || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * User-mode protection key violation (PKU). May happen * either from usermode or from kernel if copyin accessed * key-protected mapping. */ if ((frame->tf_err & PGEX_PK) != 0) { if (eva > VM_MAXUSER_ADDRESS) { trap_fatal(frame, eva); return (-1); } rv = KERN_PROTECTION_FAILURE; goto after_vmfault; } /* * If nx protection of the usermode portion of kernel page * tables caused trap, panic. */ if (usermode && trap_is_pti(frame)) panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid, p->p_comm, frame->tf_err); /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } after_vmfault: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss; u_int type; struct soft_segment_descriptor softseg; char *msg; #ifdef KDB bool handled; #endif code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, TRAPF_USERMODE(frame) ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s%s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", - code & PGEX_PK ? " prot key" : " ", - code & PGEX_SGX ? " SGX" : " ", + code & PGEX_PK ? " prot key" : "", + code & PGEX_SGX ? " SGX" : "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); ss = frame->tf_ss & 0xffff; printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(type, 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n" "rip %#lx rsp %#lx rbp %#lx\n" "rax %#lx rdx %#lx rbx %#lx\n" "rcx %#lx rsi %#lx rdi %#lx\n" "r8 %#lx r9 %#lx r10 %#lx\n" "r11 %#lx r12 %#lx r13 %#lx\n" "r14 %#lx r15 %#lx rflags %#lx\n" "cs %#lx ss %#lx ds %#hx es %#hx fs %#hx gs %#hx\n" "fsbase %#lx gsbase %#lx kgsbase %#lx\n", frame->tf_rip, frame->tf_rsp, frame->tf_rbp, frame->tf_rax, frame->tf_rdx, frame->tf_rbx, frame->tf_rcx, frame->tf_rdi, frame->tf_rsi, frame->tf_r8, frame->tf_r9, frame->tf_r10, frame->tf_r11, frame->tf_r12, frame->tf_r13, frame->tf_r14, frame->tf_r15, frame->tf_rflags, frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, frame->tf_fs, frame->tf_gs, rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } static int __noinline cpu_fetch_syscall_args_fallback(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; register_t *argp; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; reg = 0; regcnt = NARGREGS; sa->code = frame->tf_rax; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= nitems(sa->args), ("Too many syscall arguments!")); argp = &frame->tf_rdi; argp += reg; memcpy(sa->args, argp, sizeof(sa->args[0]) * NARGREGS); if (sa->narg > regcnt) { params = (caddr_t)frame->tf_rsp + sizeof(register_t); error = copyin(params, &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); if (__predict_false(error != 0)) return (error); } td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->tf_rax; if (__predict_false(sa->code == SYS_syscall || sa->code == SYS___syscall || sa->code >= p->p_sysent->sv_size)) return (cpu_fetch_syscall_args_fallback(td, sa)); sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= nitems(sa->args), ("Too many syscall arguments!")); if (__predict_false(sa->narg > NARGREGS)) return (cpu_fetch_syscall_args_fallback(td, sa)); memcpy(sa->args, &frame->tf_rdi, sizeof(sa->args[0]) * NARGREGS); td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } #include "../../kern/subr_syscall.c" static void (*syscall_ret_l1d_flush)(void); int syscall_ret_l1d_flush_mode; static void flush_l1d_hw(void) { wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); } static void __inline amd64_syscall_ret_flush_l1d_inline(int error) { void (*p)(void); if (error != 0 && error != EEXIST && error != EAGAIN && error != EXDEV && error != ENOENT && error != ENOTCONN && error != EINPROGRESS) { p = syscall_ret_l1d_flush; if (p != NULL) p(); } } void amd64_syscall_ret_flush_l1d(int error) { amd64_syscall_ret_flush_l1d_inline(error); } void amd64_syscall_ret_flush_l1d_recalc(void) { bool l1d_hw; l1d_hw = (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0; again: switch (syscall_ret_l1d_flush_mode) { case 0: syscall_ret_l1d_flush = NULL; break; case 1: syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw : flush_l1d_sw_abi; break; case 2: syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw : NULL; break; case 3: syscall_ret_l1d_flush = flush_l1d_sw_abi; break; default: syscall_ret_l1d_flush_mode = 1; goto again; } } static int machdep_syscall_ret_flush_l1d(SYSCTL_HANDLER_ARGS) { int error, val; val = syscall_ret_l1d_flush_mode; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); syscall_ret_l1d_flush_mode = val; amd64_syscall_ret_flush_l1d_recalc(); return (0); } SYSCTL_PROC(_machdep, OID_AUTO, syscall_ret_flush_l1d, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, machdep_syscall_ret_flush_l1d, "I", "Flush L1D on syscall return with error (0 - off, 1 - on, " "2 - use hw only, 3 - use sw only"); /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!TRAPF_USERMODE(td->td_frame)) { panic("syscall"); /* NOT REACHED */ } #endif error = syscallenter(td); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_md.md_invl_gen.gen == 0, ("System call %s returning with leaked invl_gen %lu", syscallname(td->td_proc, td->td_sa.code), td->td_md.md_invl_gen.gen)); syscallret(td, error); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (__predict_false(td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); amd64_syscall_ret_flush_l1d_inline(error); } Index: projects/runtime-coverage-v2/sys/amd64/include/pcpu.h =================================================================== --- projects/runtime-coverage-v2/sys/amd64/include/pcpu.h (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/include/pcpu.h (revision 347599) @@ -1,271 +1,275 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #ifndef _SYS_CDEFS_H_ #error "sys/cdefs.h is a prerequisite for this file" #endif #define PC_PTI_STACK_SZ 16 struct monitorbuf { int idle_state; /* Used by cpu_idle_mwait. */ int stop_state; /* Used by cpustop_handler. */ char padding[128 - (2 * sizeof(int))]; }; _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line"); /* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. * The reason for doing it via a struct is so that an array of pointers * to each CPU's data can be set up for things like "check curproc on all * other processors" */ #define PCPU_MD_FIELDS \ struct monitorbuf pc_monitorbuf __aligned(128); /* cache line */\ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \ struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \ uint64_t pc_kcr3; \ uint64_t pc_ucr3; \ uint64_t pc_saved_ucr3; \ register_t pc_rsp0; \ register_t pc_scratch_rsp; /* User %rsp in syscall */ \ register_t pc_scratch_rax; \ u_int pc_apic_id; \ u_int pc_acpi_id; /* ACPI CPU id */ \ /* Pointer to the CPU %fs descriptor */ \ struct user_segment_descriptor *pc_fs32p; \ /* Pointer to the CPU %gs descriptor */ \ struct user_segment_descriptor *pc_gs32p; \ /* Pointer to the CPU LDT descriptor */ \ struct system_segment_descriptor *pc_ldt; \ /* Pointer to the CPU TSS descriptor */ \ struct system_segment_descriptor *pc_tss; \ uint64_t pc_pm_save_cnt; \ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ uint64_t pc_dbreg[16]; /* ddb debugging regs */ \ uint64_t pc_pti_stack[PC_PTI_STACK_SZ]; \ register_t pc_pti_rsp0; \ int pc_dbreg_cmd; /* ddb debugging reg cmd */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ uint32_t pc_pcid_next; \ uint32_t pc_pcid_gen; \ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ uint32_t pc_ibpb_set; \ + void *pc_mds_buf; \ + void *pc_mds_buf64; \ + uint32_t pc_pad[2]; \ + uint8_t pc_mds_tmp[64]; \ u_int pc_ipi_bitmap; \ - char __pad[3284] /* pad to UMA_PCPU_ALLOC_SIZE */ + char __pad[3172] /* pad to UMA_PCPU_ALLOC_SIZE */ #define PC_DBREG_CMD_NONE 0 #define PC_DBREG_CMD_LOAD 1 #ifdef _KERNEL #define MONITOR_STOPSTATE_RUNNING 0 #define MONITOR_STOPSTATE_STOPPED 1 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) /* * Evaluates to the byte offset of the per-cpu variable name. */ #define __pcpu_offset(name) \ __offsetof(struct pcpu, name) /* * Evaluates to the type of the per-cpu variable name. */ #define __pcpu_type(name) \ __typeof(((struct pcpu *)0)->name) /* * Evaluates to the address of the per-cpu variable name. */ #define __PCPU_PTR(name) __extension__ ({ \ __pcpu_type(name) *__p; \ \ __asm __volatile("movq %%gs:%1,%0; addq %2,%0" \ : "=r" (__p) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \ "i" (__pcpu_offset(name))); \ \ __p; \ }) /* * Evaluates to the value of the per-cpu variable name. */ #define __PCPU_GET(name) __extension__ ({ \ __pcpu_type(name) __res; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ if (sizeof(__res) == 1 || sizeof(__res) == 2 || \ sizeof(__res) == 4 || sizeof(__res) == 8) { \ __asm __volatile("mov %%gs:%1,%0" \ : "=r" (__s) \ : "m" (*(struct __s *)(__pcpu_offset(name)))); \ *(struct __s *)(void *)&__res = __s; \ } else { \ __res = *__PCPU_PTR(name); \ } \ __res; \ }) /* * Adds the value to the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_ADD(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4 || sizeof(__val) == 8) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("add %1,%%gs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else \ *__PCPU_PTR(name) += __val; \ } while (0) /* * Increments the value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_INC(name) do { \ CTASSERT(sizeof(__pcpu_type(name)) == 1 || \ sizeof(__pcpu_type(name)) == 2 || \ sizeof(__pcpu_type(name)) == 4 || \ sizeof(__pcpu_type(name)) == 8); \ if (sizeof(__pcpu_type(name)) == 1) { \ __asm __volatile("incb %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 2) { \ __asm __volatile("incw %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 4) { \ __asm __volatile("incl %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 8) { \ __asm __volatile("incq %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } \ } while (0) /* * Sets the value of the per-cpu variable name to value val. */ #define __PCPU_SET(name, val) { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4 || sizeof(__val) == 8) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("mov %1,%%gs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else { \ *__PCPU_PTR(name) = __val; \ } \ } #define get_pcpu() __extension__ ({ \ struct pcpu *__pc; \ \ __asm __volatile("movq %%gs:%1,%0" \ : "=r" (__pc) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace)))); \ __pc; \ }) #define PCPU_GET(member) __PCPU_GET(pc_ ## member) #define PCPU_ADD(member, val) __PCPU_ADD(pc_ ## member, val) #define PCPU_INC(member) __PCPU_INC(pc_ ## member) #define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) #define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) #define OFFSETOF_CURTHREAD 0 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnull-dereference" #endif static __inline __pure2 struct thread * __curthread(void) { struct thread *td; __asm("movq %%gs:%P1,%0" : "=r" (td) : "n" (OFFSETOF_CURTHREAD)); return (td); } #ifdef __clang__ #pragma clang diagnostic pop #endif #define curthread (__curthread()) #define OFFSETOF_CURPCB 32 static __inline __pure2 struct pcb * __curpcb(void) { struct pcb *pcb; __asm("movq %%gs:%P1,%0" : "=r" (pcb) : "n" (OFFSETOF_CURPCB)); return (pcb); } #define curpcb (__curpcb()) #define IS_BSP() (PCPU_GET(cpuid) == 0) #else /* !__GNUCLIKE_ASM || !__GNUCLIKE___TYPEOF */ #error "this file needs to be ported to your compiler" #endif /* __GNUCLIKE_ASM && __GNUCLIKE___TYPEOF */ #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: projects/runtime-coverage-v2/sys/amd64/linux/linux_ptrace.c =================================================================== --- projects/runtime-coverage-v2/sys/amd64/linux/linux_ptrace.c (revision 347598) +++ projects/runtime-coverage-v2/sys/amd64/linux/linux_ptrace.c (revision 347599) @@ -1,541 +1,556 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Edward Tomasz Napierala * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #define LINUX_PTRACE_TRACEME 0 #define LINUX_PTRACE_PEEKTEXT 1 #define LINUX_PTRACE_PEEKDATA 2 #define LINUX_PTRACE_PEEKUSER 3 #define LINUX_PTRACE_POKETEXT 4 #define LINUX_PTRACE_POKEDATA 5 #define LINUX_PTRACE_POKEUSER 6 #define LINUX_PTRACE_CONT 7 #define LINUX_PTRACE_KILL 8 #define LINUX_PTRACE_SINGLESTEP 9 #define LINUX_PTRACE_GETREGS 12 #define LINUX_PTRACE_SETREGS 13 #define LINUX_PTRACE_GETFPREGS 14 #define LINUX_PTRACE_SETFPREGS 15 #define LINUX_PTRACE_ATTACH 16 #define LINUX_PTRACE_DETACH 17 #define LINUX_PTRACE_SYSCALL 24 #define LINUX_PTRACE_SETOPTIONS 0x4200 #define LINUX_PTRACE_GETREGSET 0x4204 #define LINUX_PTRACE_SEIZE 0x4206 #define LINUX_PTRACE_O_TRACESYSGOOD 1 #define LINUX_PTRACE_O_TRACEFORK 2 #define LINUX_PTRACE_O_TRACEVFORK 4 #define LINUX_PTRACE_O_TRACECLONE 8 #define LINUX_PTRACE_O_TRACEEXEC 16 #define LINUX_PTRACE_O_TRACEVFORKDONE 32 #define LINUX_PTRACE_O_TRACEEXIT 64 #define LINUX_PTRACE_O_TRACESECCOMP 128 #define LINUX_PTRACE_O_EXITKILL 1048576 #define LINUX_PTRACE_O_SUSPEND_SECCOMP 2097152 #define LINUX_NT_PRSTATUS 1 #define LINUX_PTRACE_O_MASK (LINUX_PTRACE_O_TRACESYSGOOD | \ LINUX_PTRACE_O_TRACEFORK | LINUX_PTRACE_O_TRACEVFORK | \ LINUX_PTRACE_O_TRACECLONE | LINUX_PTRACE_O_TRACEEXEC | \ LINUX_PTRACE_O_TRACEVFORKDONE | LINUX_PTRACE_O_TRACEEXIT | \ LINUX_PTRACE_O_TRACESECCOMP | LINUX_PTRACE_O_EXITKILL | \ LINUX_PTRACE_O_SUSPEND_SECCOMP) static int map_signum(int lsig, int *bsigp) { int bsig; if (lsig == 0) { *bsigp = 0; return (0); } if (lsig < 0 || lsig > LINUX_SIGRTMAX) return (EINVAL); bsig = linux_to_bsd_signal(lsig); if (bsig == SIGSTOP) bsig = 0; *bsigp = bsig; return (0); } struct linux_pt_reg { l_ulong r15; l_ulong r14; l_ulong r13; l_ulong r12; l_ulong rbp; l_ulong rbx; l_ulong r11; l_ulong r10; l_ulong r9; l_ulong r8; l_ulong rax; l_ulong rcx; l_ulong rdx; l_ulong rsi; l_ulong rdi; l_ulong orig_rax; l_ulong rip; l_ulong cs; l_ulong eflags; l_ulong rsp; l_ulong ss; }; struct linux_pt_regset { l_ulong r15; l_ulong r14; l_ulong r13; l_ulong r12; l_ulong rbp; l_ulong rbx; l_ulong r11; l_ulong r10; l_ulong r9; l_ulong r8; l_ulong rax; l_ulong rcx; l_ulong rdx; l_ulong rsi; l_ulong rdi; l_ulong orig_rax; l_ulong rip; l_ulong cs; l_ulong eflags; l_ulong rsp; l_ulong ss; l_ulong fs_base; l_ulong gs_base; l_ulong ds; l_ulong es; l_ulong fs; l_ulong gs; }; /* * Translate amd64 ptrace registers between Linux and FreeBSD formats. * The translation is pretty straighforward, for all registers but * orig_rax on Linux side and r_trapno and r_err in FreeBSD. */ static void map_regs_to_linux(struct reg *b_reg, struct linux_pt_reg *l_reg) { l_reg->r15 = b_reg->r_r15; l_reg->r14 = b_reg->r_r14; l_reg->r13 = b_reg->r_r13; l_reg->r12 = b_reg->r_r12; l_reg->rbp = b_reg->r_rbp; l_reg->rbx = b_reg->r_rbx; l_reg->r11 = b_reg->r_r11; l_reg->r10 = b_reg->r_r10; l_reg->r9 = b_reg->r_r9; l_reg->r8 = b_reg->r_r8; l_reg->rax = b_reg->r_rax; l_reg->rcx = b_reg->r_rcx; l_reg->rdx = b_reg->r_rdx; l_reg->rsi = b_reg->r_rsi; l_reg->rdi = b_reg->r_rdi; l_reg->orig_rax = b_reg->r_rax; l_reg->rip = b_reg->r_rip; l_reg->cs = b_reg->r_cs; l_reg->eflags = b_reg->r_rflags; l_reg->rsp = b_reg->r_rsp; l_reg->ss = b_reg->r_ss; } static void map_regs_to_linux_regset(struct reg *b_reg, unsigned long fs_base, unsigned long gs_base, struct linux_pt_regset *l_regset) { l_regset->r15 = b_reg->r_r15; l_regset->r14 = b_reg->r_r14; l_regset->r13 = b_reg->r_r13; l_regset->r12 = b_reg->r_r12; l_regset->rbp = b_reg->r_rbp; l_regset->rbx = b_reg->r_rbx; l_regset->r11 = b_reg->r_r11; l_regset->r10 = b_reg->r_r10; l_regset->r9 = b_reg->r_r9; l_regset->r8 = b_reg->r_r8; l_regset->rax = b_reg->r_rax; l_regset->rcx = b_reg->r_rcx; l_regset->rdx = b_reg->r_rdx; l_regset->rsi = b_reg->r_rsi; l_regset->rdi = b_reg->r_rdi; l_regset->orig_rax = b_reg->r_rax; l_regset->rip = b_reg->r_rip; l_regset->cs = b_reg->r_cs; l_regset->eflags = b_reg->r_rflags; l_regset->rsp = b_reg->r_rsp; l_regset->ss = b_reg->r_ss; l_regset->fs_base = fs_base; l_regset->gs_base = gs_base; l_regset->ds = b_reg->r_ds; l_regset->es = b_reg->r_es; l_regset->fs = b_reg->r_fs; l_regset->gs = b_reg->r_gs; } static void map_regs_from_linux(struct reg *b_reg, struct linux_pt_reg *l_reg) { b_reg->r_r15 = l_reg->r15; b_reg->r_r14 = l_reg->r14; b_reg->r_r13 = l_reg->r13; b_reg->r_r12 = l_reg->r12; b_reg->r_r11 = l_reg->r11; b_reg->r_r10 = l_reg->r10; b_reg->r_r9 = l_reg->r9; b_reg->r_r8 = l_reg->r8; b_reg->r_rdi = l_reg->rdi; b_reg->r_rsi = l_reg->rsi; b_reg->r_rbp = l_reg->rbp; b_reg->r_rbx = l_reg->rbx; b_reg->r_rdx = l_reg->rdx; b_reg->r_rcx = l_reg->rcx; b_reg->r_rax = l_reg->rax; /* * XXX: Are zeroes the right thing to put here? */ b_reg->r_trapno = 0; b_reg->r_fs = 0; b_reg->r_gs = 0; b_reg->r_err = 0; b_reg->r_es = 0; b_reg->r_ds = 0; b_reg->r_rip = l_reg->rip; b_reg->r_cs = l_reg->cs; b_reg->r_rflags = l_reg->eflags; b_reg->r_rsp = l_reg->rsp; b_reg->r_ss = l_reg->ss; } static int linux_ptrace_peek(struct thread *td, pid_t pid, void *addr, void *data) { int error; error = kern_ptrace(td, PT_READ_I, pid, addr, 0); if (error == 0) error = copyout(td->td_retval, data, sizeof(l_int)); td->td_retval[0] = error; return (error); } static int linux_ptrace_setoptions(struct thread *td, pid_t pid, l_ulong data) { int mask; mask = 0; if (data & ~LINUX_PTRACE_O_MASK) { printf("%s: unknown ptrace option %lx set; " "returning EINVAL\n", __func__, data & ~LINUX_PTRACE_O_MASK); return (EINVAL); } /* * PTRACE_O_EXITKILL is ignored, we do that by default. */ if (data & LINUX_PTRACE_O_TRACESYSGOOD) { printf("%s: PTRACE_O_TRACESYSGOOD not implemented; " "returning EINVAL\n", __func__); return (EINVAL); } if (data & LINUX_PTRACE_O_TRACEFORK) mask |= PTRACE_FORK; if (data & LINUX_PTRACE_O_TRACEVFORK) mask |= PTRACE_VFORK; if (data & LINUX_PTRACE_O_TRACECLONE) mask |= PTRACE_VFORK; if (data & LINUX_PTRACE_O_TRACEEXEC) mask |= PTRACE_EXEC; if (data & LINUX_PTRACE_O_TRACEVFORKDONE) mask |= PTRACE_VFORK; /* XXX: Close enough? */ if (data & LINUX_PTRACE_O_TRACEEXIT) { printf("%s: PTRACE_O_TRACEEXIT not implemented; " "returning EINVAL\n", __func__); return (EINVAL); } return (kern_ptrace(td, PT_SET_EVENT_MASK, pid, &mask, sizeof(mask))); } static int linux_ptrace_getregs(struct thread *td, pid_t pid, void *data) { struct ptrace_lwpinfo lwpinfo; struct reg b_reg; struct linux_pt_reg l_reg; int error; error = kern_ptrace(td, PT_GETREGS, pid, &b_reg, 0); if (error != 0) return (error); map_regs_to_linux(&b_reg, &l_reg); - /* - * The strace(1) utility depends on RAX being set to -ENOSYS - * on syscall entry. - */ error = kern_ptrace(td, PT_LWPINFO, pid, &lwpinfo, sizeof(lwpinfo)); if (error != 0) { printf("%s: PT_LWPINFO failed with error %d\n", __func__, error); return (error); } - if (lwpinfo.pl_flags & PL_FLAG_SCE) - l_reg.rax = -38; // XXX: Don't hardcode? + if (lwpinfo.pl_flags & PL_FLAG_SCE) { + /* + * The strace(1) utility depends on RAX being set to -ENOSYS + * on syscall entry; otherwise it loops printing those: + * + * [ Process PID=928 runs in 64 bit mode. ] + * [ Process PID=928 runs in x32 mode. ] + */ + l_reg.rax = -38; /* -ENOSYS */ + /* + * Undo the mangling done in exception.S:fast_syscall_common(). + */ + l_reg.r10 = l_reg.rcx; + } + error = copyout(&l_reg, (void *)data, sizeof(l_reg)); return (error); } static int linux_ptrace_setregs(struct thread *td, pid_t pid, void *data) { struct reg b_reg; struct linux_pt_reg l_reg; int error; error = copyin(data, &l_reg, sizeof(l_reg)); if (error != 0) return (error); map_regs_from_linux(&b_reg, &l_reg); error = kern_ptrace(td, PT_SETREGS, pid, &b_reg, 0); return (error); } static int linux_ptrace_getregset_prstatus(struct thread *td, pid_t pid, l_ulong data) { struct ptrace_lwpinfo lwpinfo; struct reg b_reg; struct linux_pt_regset l_regset; struct iovec iov; struct pcb *pcb; unsigned long fsbase, gsbase; size_t len; int error; error = copyin((const void *)data, &iov, sizeof(iov)); if (error != 0) { printf("%s: copyin error %d\n", __func__, error); return (error); } error = kern_ptrace(td, PT_GETREGS, pid, &b_reg, 0); if (error != 0) return (error); pcb = td->td_pcb; if (td == curthread) update_pcb_bases(pcb); fsbase = pcb->pcb_fsbase; gsbase = pcb->pcb_gsbase; map_regs_to_linux_regset(&b_reg, fsbase, gsbase, &l_regset); - /* - * The strace(1) utility depends on RAX being set to -ENOSYS - * on syscall entry; otherwise it loops printing those: - * - * [ Process PID=928 runs in 64 bit mode. ] - * [ Process PID=928 runs in x32 mode. ] - */ error = kern_ptrace(td, PT_LWPINFO, pid, &lwpinfo, sizeof(lwpinfo)); if (error != 0) { printf("%s: PT_LWPINFO failed with error %d\n", __func__, error); return (error); } - if (lwpinfo.pl_flags & PL_FLAG_SCE) - l_regset.rax = -38; // XXX: Don't hardcode? + if (lwpinfo.pl_flags & PL_FLAG_SCE) { + /* + * The strace(1) utility depends on RAX being set to -ENOSYS + * on syscall entry; otherwise it loops printing those: + * + * [ Process PID=928 runs in 64 bit mode. ] + * [ Process PID=928 runs in x32 mode. ] + */ + l_regset.rax = -38; /* -ENOSYS */ + + /* + * Undo the mangling done in exception.S:fast_syscall_common(). + */ + l_regset.r10 = l_regset.rcx; + } len = MIN(iov.iov_len, sizeof(l_regset)); error = copyout(&l_regset, (void *)iov.iov_base, len); if (error != 0) { printf("%s: copyout error %d\n", __func__, error); return (error); } iov.iov_len -= len; error = copyout(&iov, (void *)data, sizeof(iov)); if (error != 0) { printf("%s: iov copyout error %d\n", __func__, error); return (error); } return (error); } static int linux_ptrace_getregset(struct thread *td, pid_t pid, l_ulong addr, l_ulong data) { switch (addr) { case LINUX_NT_PRSTATUS: return (linux_ptrace_getregset_prstatus(td, pid, data)); default: printf("%s: PTRACE_GETREGSET request %ld not implemented; " "returning EINVAL\n", __func__, addr); return (EINVAL); } } static int linux_ptrace_seize(struct thread *td, pid_t pid, l_ulong addr, l_ulong data) { printf("%s: PTRACE_SEIZE not implemented; returning EINVAL\n", __func__); return (EINVAL); } int linux_ptrace(struct thread *td, struct linux_ptrace_args *uap) { void *addr; pid_t pid; int error, sig; pid = (pid_t)uap->pid; addr = (void *)uap->addr; switch (uap->req) { case LINUX_PTRACE_TRACEME: error = kern_ptrace(td, PT_TRACE_ME, 0, 0, 0); break; case LINUX_PTRACE_PEEKTEXT: case LINUX_PTRACE_PEEKDATA: error = linux_ptrace_peek(td, pid, addr, (void *)uap->data); if (error != 0) return (error); /* * Linux expects this syscall to read 64 bits, not 32. */ error = linux_ptrace_peek(td, pid, (void *)(uap->addr + 4), (void *)(uap->data + 4)); break; case LINUX_PTRACE_POKETEXT: error = kern_ptrace(td, PT_WRITE_I, pid, addr, uap->data); break; case LINUX_PTRACE_POKEDATA: error = kern_ptrace(td, PT_WRITE_D, pid, addr, uap->data); break; case LINUX_PTRACE_CONT: error = map_signum(uap->data, &sig); if (error != 0) break; error = kern_ptrace(td, PT_CONTINUE, pid, (void *)1, sig); break; case LINUX_PTRACE_KILL: error = kern_ptrace(td, PT_KILL, pid, addr, uap->data); break; case LINUX_PTRACE_SINGLESTEP: error = map_signum(uap->data, &sig); if (error != 0) break; error = kern_ptrace(td, PT_STEP, pid, (void *)1, sig); break; case LINUX_PTRACE_GETREGS: error = linux_ptrace_getregs(td, pid, (void *)uap->data); break; case LINUX_PTRACE_SETREGS: error = linux_ptrace_setregs(td, pid, (void *)uap->data); break; case LINUX_PTRACE_ATTACH: error = kern_ptrace(td, PT_ATTACH, pid, addr, uap->data); break; case LINUX_PTRACE_DETACH: error = map_signum(uap->data, &sig); if (error != 0) break; error = kern_ptrace(td, PT_DETACH, pid, (void *)1, sig); break; case LINUX_PTRACE_SYSCALL: error = map_signum(uap->data, &sig); if (error != 0) break; error = kern_ptrace(td, PT_SYSCALL, pid, (void *)1, sig); break; case LINUX_PTRACE_SETOPTIONS: error = linux_ptrace_setoptions(td, pid, uap->data); break; case LINUX_PTRACE_GETREGSET: error = linux_ptrace_getregset(td, pid, uap->addr, uap->data); break; case LINUX_PTRACE_SEIZE: error = linux_ptrace_seize(td, pid, uap->addr, uap->data); break; default: printf("%s: ptrace(%ld, ...) not implemented; returning EINVAL\n", __func__, uap->req); error = EINVAL; break; } return (error); } Index: projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/timer.h =================================================================== --- projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/timer.h (revision 347598) +++ projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/timer.h (revision 347599) @@ -1,94 +1,94 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_TIMER_H_ #define _LINUX_TIMER_H_ #include #include #include #include struct timer_list { struct callout callout; union { void (*function) (unsigned long); /* < v4.15 */ void (*function_415) (struct timer_list *); }; unsigned long data; int expires; }; extern unsigned long linux_timer_hz_mask; #define TIMER_IRQSAFE 0x0001 #define from_timer(var, arg, field) \ container_of(arg, typeof(*(var)), field) #define timer_setup(timer, func, flags) do { \ CTASSERT(((flags) & ~TIMER_IRQSAFE) == 0); \ (timer)->function_415 = (func); \ (timer)->data = (unsigned long)(timer); \ callout_init(&(timer)->callout, 1); \ } while (0) #define setup_timer(timer, func, dat) do { \ (timer)->function = (func); \ (timer)->data = (dat); \ callout_init(&(timer)->callout, 1); \ } while (0) #define __setup_timer(timer, func, dat, flags) do { \ CTASSERT(((flags) & ~TIMER_IRQSAFE) == 0); \ setup_timer(timer, func, dat); \ } while (0) #define init_timer(timer) do { \ (timer)->function = NULL; \ (timer)->data = 0; \ callout_init(&(timer)->callout, 1); \ } while (0) extern void mod_timer(struct timer_list *, int); extern void add_timer(struct timer_list *); extern void add_timer_on(struct timer_list *, int cpu); +extern int del_timer(struct timer_list *); -#define del_timer(timer) (void)callout_stop(&(timer)->callout) #define del_timer_sync(timer) (void)callout_drain(&(timer)->callout) #define timer_pending(timer) callout_pending(&(timer)->callout) #define round_jiffies(j) \ ((int)(((j) + linux_timer_hz_mask) & ~linux_timer_hz_mask)) #define round_jiffies_relative(j) round_jiffies(j) #define round_jiffies_up(j) round_jiffies(j) #define round_jiffies_up_relative(j) round_jiffies_up(j) #endif /* _LINUX_TIMER_H_ */ Index: projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/ww_mutex.h =================================================================== --- projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/ww_mutex.h (revision 347598) +++ projects/runtime-coverage-v2/sys/compat/linuxkpi/common/include/linux/ww_mutex.h (revision 347599) @@ -1,142 +1,143 @@ /*- * Copyright (c) 2017 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_WW_MUTEX_H_ #define _LINUX_WW_MUTEX_H_ #include #include #include #include #include struct ww_class { const char *mutex_name; }; struct ww_acquire_ctx { }; struct ww_mutex { struct mutex base; struct cv condvar; + struct ww_acquire_ctx *ctx; }; #define DEFINE_WW_CLASS(name) \ struct ww_class name = { \ .mutex_name = mutex_name(#name "_mutex") \ } #define DEFINE_WW_MUTEX(name, ww_class) \ struct ww_mutex name; \ static void name##_init(void *arg) \ { \ ww_mutex_init(&name, &ww_class); \ } \ SYSINIT(name, SI_SUB_LOCK, SI_ORDER_SECOND, name##_init, NULL) #define ww_mutex_is_locked(_m) \ sx_xlocked(&(_m)->base.sx) #define ww_mutex_lock_slow(_m, _x) \ ww_mutex_lock(_m, _x) #define ww_mutex_lock_slow_interruptible(_m, _x) \ ww_mutex_lock_interruptible(_m, _x) static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock) { return (mutex_trylock(&lock->base)); } extern int linux_ww_mutex_lock_sub(struct ww_mutex *, int catch_signal); static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { if (MUTEX_SKIP()) return (0); else if ((struct thread *)SX_OWNER(lock->base.sx.sx_lock) == curthread) return (-EALREADY); else return (linux_ww_mutex_lock_sub(lock, 0)); } static inline int ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { if (MUTEX_SKIP()) return (0); else if ((struct thread *)SX_OWNER(lock->base.sx.sx_lock) == curthread) return (-EALREADY); else return (linux_ww_mutex_lock_sub(lock, 1)); } extern void linux_ww_mutex_unlock_sub(struct ww_mutex *); static inline void ww_mutex_unlock(struct ww_mutex *lock) { if (MUTEX_SKIP()) return; else linux_ww_mutex_unlock_sub(lock); } static inline void ww_mutex_destroy(struct ww_mutex *lock) { cv_destroy(&lock->condvar); mutex_destroy(&lock->base); } static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, struct ww_class *ww_class) { } static inline void ww_mutex_init(struct ww_mutex *lock, struct ww_class *ww_class) { linux_mutex_init(&lock->base, ww_class->mutex_name, SX_NOWITNESS); cv_init(&lock->condvar, "lkpi-ww"); } static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) { } static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) { } #endif /* _LINUX_WW_MUTEX_H_ */ Index: projects/runtime-coverage-v2/sys/compat/linuxkpi/common/src/linux_compat.c =================================================================== --- projects/runtime-coverage-v2/sys/compat/linuxkpi/common/src/linux_compat.c (revision 347598) +++ projects/runtime-coverage-v2/sys/compat/linuxkpi/common/src/linux_compat.c (revision 347599) @@ -1,2450 +1,2459 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2018 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW, 0, "LinuxKPI parameters"); int linuxkpi_debug; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN, &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable."); MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat"); #include /* Undo Linux compat changes. */ #undef RB_ROOT #undef file #undef cdev #define RB_ROOT(head) (head)->rbh_root static void linux_cdev_deref(struct linux_cdev *ldev); static struct vm_area_struct *linux_cdev_handle_find(void *handle); struct kobject linux_class_root; struct device linux_root_device; struct class linux_class_misc; struct list_head pci_drivers; struct list_head pci_devices; spinlock_t pci_lock; unsigned long linux_timer_hz_mask; int panic_cmp(struct rb_node *one, struct rb_node *two) { panic("no cmp"); } RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args) { va_list tmp_va; int len; char *old; char *name; char dummy; old = kobj->name; if (old && fmt == NULL) return (0); /* compute length of string */ va_copy(tmp_va, args); len = vsnprintf(&dummy, 0, fmt, tmp_va); va_end(tmp_va); /* account for zero termination */ len++; /* check for error */ if (len < 1) return (-EINVAL); /* allocate memory for string */ name = kzalloc(len, GFP_KERNEL); if (name == NULL) return (-ENOMEM); vsnprintf(name, len, fmt, args); kobj->name = name; /* free old string */ kfree(old); /* filter new string */ for (; *name != '\0'; name++) if (*name == '/') *name = '!'; return (0); } int kobject_set_name(struct kobject *kobj, const char *fmt, ...) { va_list args; int error; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); return (error); } static int kobject_add_complete(struct kobject *kobj, struct kobject *parent) { const struct kobj_type *t; int error; kobj->parent = parent; error = sysfs_create_dir(kobj); if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) { struct attribute **attr; t = kobj->ktype; for (attr = t->default_attrs; *attr != NULL; attr++) { error = sysfs_create_file(kobj, *attr); if (error) break; } if (error) sysfs_remove_dir(kobj); } return (error); } int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) { va_list args; int error; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); if (error) return (error); return kobject_add_complete(kobj, parent); } void linux_kobject_release(struct kref *kref) { struct kobject *kobj; char *name; kobj = container_of(kref, struct kobject, kref); sysfs_remove_dir(kobj); name = kobj->name; if (kobj->ktype && kobj->ktype->release) kobj->ktype->release(kobj); kfree(name); } static void linux_kobject_kfree(struct kobject *kobj) { kfree(kobj); } static void linux_kobject_kfree_name(struct kobject *kobj) { if (kobj) { kfree(kobj->name); } } const struct kobj_type linux_kfree_type = { .release = linux_kobject_kfree }; static void linux_device_release(struct device *dev) { pr_debug("linux_device_release: %s\n", dev_name(dev)); kfree(dev); } static ssize_t linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct class, kobj), dattr, buf); return (error); } static ssize_t linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct class, kobj), dattr, buf, count); return (error); } static void linux_class_release(struct kobject *kobj) { struct class *class; class = container_of(kobj, struct class, kobj); if (class->class_release) class->class_release(class); } static const struct sysfs_ops linux_class_sysfs = { .show = linux_class_show, .store = linux_class_store, }; const struct kobj_type linux_class_ktype = { .release = linux_class_release, .sysfs_ops = &linux_class_sysfs }; static void linux_dev_release(struct kobject *kobj) { struct device *dev; dev = container_of(kobj, struct device, kobj); /* This is the precedence defined by linux. */ if (dev->release) dev->release(dev); else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); } static ssize_t linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct device, kobj), dattr, buf); return (error); } static ssize_t linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct device, kobj), dattr, buf, count); return (error); } static const struct sysfs_ops linux_dev_sysfs = { .show = linux_dev_show, .store = linux_dev_store, }; const struct kobj_type linux_dev_ktype = { .release = linux_dev_release, .sysfs_ops = &linux_dev_sysfs }; struct device * device_create(struct class *class, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...) { struct device *dev; va_list args; dev = kzalloc(sizeof(*dev), M_WAITOK); dev->parent = parent; dev->class = class; dev->devt = devt; dev->driver_data = drvdata; dev->release = linux_device_release; va_start(args, fmt); kobject_set_name_vargs(&dev->kobj, fmt, args); va_end(args); device_register(dev); return (dev); } int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...) { va_list args; int error; kobject_init(kobj, ktype); kobj->ktype = ktype; kobj->parent = parent; kobj->name = NULL; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); if (error) return (error); return kobject_add_complete(kobj, parent); } static void linux_kq_lock(void *arg) { spinlock_t *s = arg; spin_lock(s); } static void linux_kq_unlock(void *arg) { spinlock_t *s = arg; spin_unlock(s); } static void linux_kq_lock_owned(void *arg) { #ifdef INVARIANTS spinlock_t *s = arg; mtx_assert(&s->m, MA_OWNED); #endif } static void linux_kq_lock_unowned(void *arg) { #ifdef INVARIANTS spinlock_t *s = arg; mtx_assert(&s->m, MA_NOTOWNED); #endif } static void linux_file_kqfilter_poll(struct linux_file *, int); struct linux_file * linux_file_alloc(void) { struct linux_file *filp; filp = kzalloc(sizeof(*filp), GFP_KERNEL); /* set initial refcount */ filp->f_count = 1; /* setup fields needed by kqueue support */ spin_lock_init(&filp->f_kqlock); knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock, linux_kq_lock, linux_kq_unlock, linux_kq_lock_owned, linux_kq_lock_unowned); return (filp); } void linux_file_free(struct linux_file *filp) { if (filp->_file == NULL) { if (filp->f_shmem != NULL) vm_object_deallocate(filp->f_shmem); kfree(filp); } else { /* * The close method of the character device or file * will free the linux_file structure: */ _fdrop(filp->_file, curthread); } } static int linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) { vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset; vm_page_t page; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake * page, update it with the new physical * address. */ page = *mres; vm_page_updatefake(page, paddr, vm_obj->memattr); } else { /* * Replace the passed in "mres" page with our * own fake page and free up the all of the * original pages. */ VM_OBJECT_WUNLOCK(vm_obj); page = vm_page_getfake(paddr, vm_obj->memattr); VM_OBJECT_WLOCK(vm_obj); vm_page_replace_checked(page, vm_obj, (*mres)->pindex, *mres); vm_page_lock(*mres); vm_page_free(*mres); vm_page_unlock(*mres); *mres = page; } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); } return (VM_PAGER_FAIL); } static int linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { struct vm_area_struct *vmap; int err; linux_set_current(curthread); /* get VM area structure */ vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); VM_OBJECT_WUNLOCK(vm_obj); down_write(&vmap->vm_mm->mmap_sem); if (unlikely(vmap->vm_ops == NULL)) { err = VM_FAULT_SIGBUS; } else { struct vm_fault vmf; /* fill out VM fault structure */ vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx); vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0; vmf.pgoff = 0; vmf.page = NULL; vmf.vma = vmap; vmap->vm_pfn_count = 0; vmap->vm_pfn_pcount = &vmap->vm_pfn_count; vmap->vm_obj = vm_obj; err = vmap->vm_ops->fault(vmap, &vmf); while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) { kern_yield(PRI_USER); err = vmap->vm_ops->fault(vmap, &vmf); } } /* translate return code */ switch (err) { case VM_FAULT_OOM: err = VM_PAGER_AGAIN; break; case VM_FAULT_SIGBUS: err = VM_PAGER_BAD; break; case VM_FAULT_NOPAGE: /* * By contract the fault handler will return having * busied all the pages itself. If pidx is already * found in the object, it will simply xbusy the first * page and return with vm_pfn_count set to 1. */ *first = vmap->vm_pfn_first; *last = *first + vmap->vm_pfn_count - 1; err = VM_PAGER_OK; break; default: err = VM_PAGER_ERROR; break; } up_write(&vmap->vm_mm->mmap_sem); VM_OBJECT_WLOCK(vm_obj); return (err); } static struct rwlock linux_vma_lock; static TAILQ_HEAD(, vm_area_struct) linux_vma_head = TAILQ_HEAD_INITIALIZER(linux_vma_head); static void linux_cdev_handle_free(struct vm_area_struct *vmap) { /* Drop reference on vm_file */ if (vmap->vm_file != NULL) fput(vmap->vm_file); /* Drop reference on mm_struct */ mmput(vmap->vm_mm); kfree(vmap); } static void linux_cdev_handle_remove(struct vm_area_struct *vmap) { rw_wlock(&linux_vma_lock); TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry); rw_wunlock(&linux_vma_lock); } static struct vm_area_struct * linux_cdev_handle_find(void *handle) { struct vm_area_struct *vmap; rw_rlock(&linux_vma_lock); TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) { if (vmap->vm_private_data == handle) break; } rw_runlock(&linux_vma_lock); return (vmap); } static int linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { MPASS(linux_cdev_handle_find(handle) != NULL); *color = 0; return (0); } static void linux_cdev_pager_dtor(void *handle) { const struct vm_operations_struct *vm_ops; struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(handle); MPASS(vmap != NULL); /* * Remove handle before calling close operation to prevent * other threads from reusing the handle pointer. */ linux_cdev_handle_remove(vmap); down_write(&vmap->vm_mm->mmap_sem); vm_ops = vmap->vm_ops; if (likely(vm_ops != NULL)) vm_ops->close(vmap); up_write(&vmap->vm_mm->mmap_sem); linux_cdev_handle_free(vmap); } static struct cdev_pager_ops linux_cdev_pager_ops[2] = { { /* OBJT_MGTDEVICE */ .cdev_pg_populate = linux_cdev_pager_populate, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, { /* OBJT_DEVICE */ .cdev_pg_fault = linux_cdev_pager_fault, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, }; int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { vm_object_t obj; vm_page_t m; obj = vma->vm_obj; if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0) return (-ENOTSUP); VM_OBJECT_RLOCK(obj); for (m = vm_page_find_least(obj, OFF_TO_IDX(address)); m != NULL && m->pindex < OFF_TO_IDX(address + size); m = TAILQ_NEXT(m, listq)) pmap_remove_all(m); VM_OBJECT_RUNLOCK(obj); return (0); } static struct file_operations dummy_ldev_ops = { /* XXXKIB */ }; static struct linux_cdev dummy_ldev = { .ops = &dummy_ldev_ops, }; #define LDEV_SI_DTR 0x0001 #define LDEV_SI_REF 0x0002 static void linux_get_fop(struct linux_file *filp, const struct file_operations **fop, struct linux_cdev **dev) { struct linux_cdev *ldev; u_int siref; ldev = filp->f_cdev; *fop = filp->f_op; if (ldev != NULL) { for (siref = ldev->siref;;) { if ((siref & LDEV_SI_DTR) != 0) { ldev = &dummy_ldev; siref = ldev->siref; *fop = ldev->ops; MPASS((ldev->siref & LDEV_SI_DTR) == 0); } else if (atomic_fcmpset_int(&ldev->siref, &siref, siref + LDEV_SI_REF)) { break; } } } *dev = ldev; } static void linux_drop_fop(struct linux_cdev *ldev) { if (ldev == NULL) return; MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); atomic_subtract_int(&ldev->siref, LDEV_SI_REF); } #define OPW(fp,td,code) ({ \ struct file *__fpop; \ __typeof(code) __retval; \ \ __fpop = (td)->td_fpop; \ (td)->td_fpop = (fp); \ __retval = (code); \ (td)->td_fpop = __fpop; \ __retval; \ }) static int linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *file) { struct linux_cdev *ldev; struct linux_file *filp; const struct file_operations *fop; int error; ldev = dev->si_drv1; filp = linux_file_alloc(); filp->f_dentry = &filp->f_dentry_store; filp->f_op = ldev->ops; filp->f_mode = file->f_flag; filp->f_flags = file->f_flag; filp->f_vnode = file->f_vnode; filp->_file = file; refcount_acquire(&ldev->refs); filp->f_cdev = ldev; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->open != NULL) { error = -fop->open(file->f_vnode, filp); if (error != 0) { linux_drop_fop(ldev); linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); } } /* hold on to the vnode - used for fstat() */ vhold(filp->f_vnode); /* release the file from devfs */ finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); linux_drop_fop(ldev); return (ENXIO); } #define LINUX_IOCTL_MIN_PTR 0x10000UL #define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX) static inline int linux_remap_address(void **uaddr, size_t len) { uintptr_t uaddr_val = (uintptr_t)(*uaddr); if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR && uaddr_val < LINUX_IOCTL_MAX_PTR)) { struct task_struct *pts = current; if (pts == NULL) { *uaddr = NULL; return (1); } /* compute data offset */ uaddr_val -= LINUX_IOCTL_MIN_PTR; /* check that length is within bounds */ if ((len > IOCPARM_MAX) || (uaddr_val + len) > pts->bsd_ioctl_len) { *uaddr = NULL; return (1); } /* re-add kernel buffer address */ uaddr_val += (uintptr_t)pts->bsd_ioctl_data; /* update address location */ *uaddr = (void *)uaddr_val; return (1); } return (0); } int linux_copyin(const void *uaddr, void *kaddr, size_t len) { if (linux_remap_address(__DECONST(void **, &uaddr), len)) { if (uaddr == NULL) return (-EFAULT); memcpy(kaddr, uaddr, len); return (0); } return (-copyin(uaddr, kaddr, len)); } int linux_copyout(const void *kaddr, void *uaddr, size_t len) { if (linux_remap_address(&uaddr, len)) { if (uaddr == NULL) return (-EFAULT); memcpy(uaddr, kaddr, len); return (0); } return (-copyout(kaddr, uaddr, len)); } size_t linux_clear_user(void *_uaddr, size_t _len) { uint8_t *uaddr = _uaddr; size_t len = _len; /* make sure uaddr is aligned before going into the fast loop */ while (((uintptr_t)uaddr & 7) != 0 && len > 7) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } /* zero 8 bytes at a time */ while (len > 7) { #ifdef __LP64__ if (suword64(uaddr, 0)) return (_len); #else if (suword32(uaddr, 0)) return (_len); if (suword32(uaddr + 4, 0)) return (_len); #endif uaddr += 8; len -= 8; } /* zero fill end, if any */ while (len > 0) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } return (0); } int linux_access_ok(int rw, const void *uaddr, size_t len) { uintptr_t saddr; uintptr_t eaddr; /* get start and end address */ saddr = (uintptr_t)uaddr; eaddr = (uintptr_t)uaddr + len; /* verify addresses are valid for userspace */ return ((saddr == eaddr) || (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS)); } /* * This function should return either EINTR or ERESTART depending on * the signal type sent to this thread: */ static int linux_get_error(struct task_struct *task, int error) { /* check for signal type interrupt code */ if (error == EINTR || error == ERESTARTSYS || error == ERESTART) { error = -linux_schedule_get_interrupt_value(task); if (error == 0) error = EINTR; } return (error); } static int linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, const struct file_operations *fop, u_long cmd, caddr_t data, struct thread *td) { struct task_struct *task = current; unsigned size; int error; size = IOCPARM_LEN(cmd); /* refer to logic in sys_ioctl() */ if (size > 0) { /* * Setup hint for linux_copyin() and linux_copyout(). * * Background: Linux code expects a user-space address * while FreeBSD supplies a kernel-space address. */ task->bsd_ioctl_data = data; task->bsd_ioctl_len = size; data = (void *)LINUX_IOCTL_MIN_PTR; } else { /* fetch user-space pointer */ data = *(void **)data; } #if defined(__amd64__) if (td->td_proc->p_elf_machine == EM_386) { /* try the compat IOCTL handler first */ if (fop->compat_ioctl != NULL) { error = -OPW(fp, td, fop->compat_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } /* fallback to the regular IOCTL handler, if any */ if (error == ENOTTY && fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } } else #endif { if (fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } } if (size > 0) { task->bsd_ioctl_data = NULL; task->bsd_ioctl_len = 0; } if (error == EWOULDBLOCK) { /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } else { error = linux_get_error(task, error); } return (error); } #define LINUX_POLL_TABLE_NORMAL ((poll_table *)1) /* * This function atomically updates the poll wakeup state and returns * the previous state at the time of update. */ static uint8_t linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate) { int c, old; c = v->counter; while ((old = atomic_cmpxchg(v, c, pstate[c])) != c) c = old; return (c); } static int linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */ }; struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_QUEUED: linux_poll_wakeup(filp); return (1); default: return (0); } } void linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY, [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */ [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED, }; /* check if we are called inside the select system call */ if (p == LINUX_POLL_TABLE_NORMAL) selrecord(curthread, &filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_INIT: /* NOTE: file handles can only belong to one wait-queue */ filp->f_wait_queue.wqh = wqh; filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback; add_wait_queue(wqh, &filp->f_wait_queue.wq); atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED); break; default: break; } } static void linux_poll_wait_dequeue(struct linux_file *filp) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT, }; seldrain(&filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_NOT_READY: case LINUX_FWQ_STATE_QUEUED: case LINUX_FWQ_STATE_READY: remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq); break; default: break; } } void linux_poll_wakeup(struct linux_file *filp) { /* this function should be NULL-safe */ if (filp == NULL) return; selwakeup(&filp->f_selinfo); spin_lock(&filp->f_kqlock); filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 1); spin_unlock(&filp->f_kqlock); } static void linux_file_kqfilter_detach(struct knote *kn) { struct linux_file *filp = kn->kn_hook; spin_lock(&filp->f_kqlock); knlist_remove(&filp->f_selinfo.si_note, kn, 1); spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter_read_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock.m, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0); } static int linux_file_kqfilter_write_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock.m, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0); } static struct filterops linux_dev_kqfiltops_read = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_read_event, }; static struct filterops linux_dev_kqfiltops_write = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_write_event, }; static void linux_file_kqfilter_poll(struct linux_file *filp, int kqflags) { struct thread *td; const struct file_operations *fop; struct linux_cdev *ldev; int temp; if ((filp->f_kqflags & kqflags) == 0) return; td = curthread; linux_get_fop(filp, &fop, &ldev); /* get the latest polling state */ temp = OPW(filp->_file, td, fop->poll(filp, NULL)); linux_drop_fop(ldev); spin_lock(&filp->f_kqlock); /* clear kqflags */ filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE); /* update kqflags */ if ((temp & (POLLIN | POLLOUT)) != 0) { if ((temp & POLLIN) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; if ((temp & POLLOUT) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); } spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter(struct file *file, struct knote *kn) { struct linux_file *filp; struct thread *td; int error; td = curthread; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; if (filp->f_op->poll == NULL) return (EINVAL); spin_lock(&filp->f_kqlock); switch (kn->kn_filter) { case EVFILT_READ: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ; kn->kn_fop = &linux_dev_kqfiltops_read; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; case EVFILT_WRITE: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE; kn->kn_fop = &linux_dev_kqfiltops_write; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; default: error = EINVAL; break; } spin_unlock(&filp->f_kqlock); if (error == 0) { linux_set_current(td); /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } return (error); } static int linux_file_mmap_single(struct file *fp, const struct file_operations *fop, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot, struct thread *td) { struct task_struct *task; struct vm_area_struct *vmap; struct mm_struct *mm; struct linux_file *filp; vm_memattr_t attr; int error; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; if (fop->mmap == NULL) return (EOPNOTSUPP); linux_set_current(td); /* * The same VM object might be shared by multiple processes * and the mm_struct is usually freed when a process exits. * * The atomic reference below makes sure the mm_struct is * available as long as the vmap is in the linux_vma_head. */ task = current; mm = task->mm; if (atomic_inc_not_zero(&mm->mm_users) == 0) return (EINVAL); vmap = kzalloc(sizeof(*vmap), GFP_KERNEL); vmap->vm_start = 0; vmap->vm_end = size; vmap->vm_pgoff = *offset / PAGE_SIZE; vmap->vm_pfn = 0; vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL); vmap->vm_ops = NULL; vmap->vm_file = get_file(filp); vmap->vm_mm = mm; if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { error = linux_get_error(task, EINTR); } else { error = -OPW(fp, td, fop->mmap(filp, vmap)); error = linux_get_error(task, error); up_write(&vmap->vm_mm->mmap_sem); } if (error != 0) { linux_cdev_handle_free(vmap); return (error); } attr = pgprot2cachemode(vmap->vm_page_prot); if (vmap->vm_ops != NULL) { struct vm_area_struct *ptr; void *vm_private_data; bool vm_no_fault; if (vmap->vm_ops->open == NULL || vmap->vm_ops->close == NULL || vmap->vm_private_data == NULL) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); return (EINVAL); } vm_private_data = vmap->vm_private_data; rw_wlock(&linux_vma_lock); TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) { if (ptr->vm_private_data == vm_private_data) break; } /* check if there is an existing VM area struct */ if (ptr != NULL) { /* check if the VM area structure is invalid */ if (ptr->vm_ops == NULL || ptr->vm_ops->open == NULL || ptr->vm_ops->close == NULL) { error = ESTALE; vm_no_fault = 1; } else { error = EEXIST; vm_no_fault = (ptr->vm_ops->fault == NULL); } } else { /* insert VM area structure into list */ TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry); error = 0; vm_no_fault = (vmap->vm_ops->fault == NULL); } rw_wunlock(&linux_vma_lock); if (error != 0) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); /* check for stale VM area struct */ if (error != EEXIST) return (error); } /* check if there is no fault handler */ if (vm_no_fault) { *object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE, &linux_cdev_pager_ops[1], size, nprot, *offset, td->td_ucred); } else { *object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE, &linux_cdev_pager_ops[0], size, nprot, *offset, td->td_ucred); } /* check if allocating the VM object failed */ if (*object == NULL) { if (error == 0) { /* remove VM area struct from list */ linux_cdev_handle_remove(vmap); /* free allocated VM area struct */ linux_cdev_handle_free(vmap); } return (EINVAL); } } else { struct sglist *sg; sg = sglist_alloc(1, M_WAITOK); sglist_append_phys(sg, (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len); *object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len, nprot, 0, td->td_ucred); linux_cdev_handle_free(vmap); if (*object == NULL) { sglist_free(sg); return (EINVAL); } } if (attr != VM_MEMATTR_DEFAULT) { VM_OBJECT_WLOCK(*object); vm_object_set_memattr(*object, attr); VM_OBJECT_WUNLOCK(*object); } *offset = 0; return (0); } struct cdevsw linuxcdevsw = { .d_version = D_VERSION, .d_fdopen = linux_dev_fdopen, .d_name = "lkpidev", }; static int linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; error = 0; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->read != NULL) { bytes = OPW(file, td, fop->read(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ); linux_drop_fop(ldev); return (error); } static int linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->write != NULL) { bytes = OPW(file, td, fop->write(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; error = 0; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE); linux_drop_fop(ldev); return (error); } static int linux_file_poll(struct file *file, int events, struct ucred *active_cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; int revents; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->poll != NULL) { revents = OPW(file, td, fop->poll(filp, LINUX_POLL_TABLE_NORMAL)) & events; } else { revents = 0; } linux_drop_fop(ldev); return (revents); } static int linux_file_close(struct file *file, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; int error; filp = (struct linux_file *)file->f_data; KASSERT(file_count(filp) == 0, ("File refcount(%d) is not zero", file_count(filp))); error = 0; filp->f_flags = file->f_flag; linux_set_current(td); linux_poll_wait_dequeue(filp); linux_get_fop(filp, &fop, &ldev); if (fop->release != NULL) error = -OPW(file, td, fop->release(filp->f_vnode, filp)); funsetown(&filp->f_sigio); if (filp->f_vnode != NULL) vdrop(filp->f_vnode); linux_drop_fop(ldev); if (filp->f_cdev != NULL) linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); } static int linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; int error; error = 0; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; linux_get_fop(filp, &fop, &ldev); linux_set_current(td); switch (cmd) { case FIONBIO: break; case FIOASYNC: if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); break; case FIOSETOWN: error = fsetown(*(int *)data, &filp->f_sigio); if (error == 0) { if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); } break; case FIOGETOWN: *(int *)data = fgetown(&filp->f_sigio); break; default: error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td); break; } linux_drop_fop(ldev); return (error); } static int linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct file *fp, vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp) { /* * Character devices do not provide private mappings * of any kind: */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0) return (EINVAL); return (linux_file_mmap_single(fp, fop, foff, objsize, objp, (int)prot, td)); } static int linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; int error; filp = (struct linux_file *)fp->f_data; vp = filp->f_vnode; if (vp == NULL) return (EOPNOTSUPP); /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE * requests rather than doing it here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; linux_get_fop(filp, &fop, &ldev); error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp, &foff, fop, &object); if (error != 0) goto out; error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); out: linux_drop_fop(ldev); return (error); } static int linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct linux_file *filp; struct vnode *vp; int error; filp = (struct linux_file *)fp->f_data; if (filp->f_vnode == NULL) return (EOPNOTSUPP); vp = filp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); error = vn_stat(vp, sb, td->td_ucred, NOCRED, td); VOP_UNLOCK(vp, 0); return (error); } static int linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct linux_file *filp; struct vnode *vp; int error; filp = fp->f_data; vp = filp->f_vnode; if (vp == NULL) { error = 0; kif->kf_type = KF_TYPE_DEV; } else { vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); kif->kf_type = KF_TYPE_VNODE; FILEDESC_SLOCK(fdp); } return (error); } unsigned int linux_iminor(struct inode *inode) { struct linux_cdev *ldev; if (inode == NULL || inode->v_rdev == NULL || inode->v_rdev->si_devsw != &linuxcdevsw) return (-1U); ldev = inode->v_rdev->si_drv1; if (ldev == NULL) return (-1U); return (minor(ldev->dev)); } struct fileops linuxfileops = { .fo_read = linux_file_read, .fo_write = linux_file_write, .fo_truncate = invfo_truncate, .fo_kqfilter = linux_file_kqfilter, .fo_stat = linux_file_stat, .fo_fill_kinfo = linux_file_fill_kinfo, .fo_poll = linux_file_poll, .fo_close = linux_file_close, .fo_ioctl = linux_file_ioctl, .fo_mmap = linux_file_mmap, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_flags = DFLAG_PASSABLE, }; /* * Hash of vmmap addresses. This is infrequently accessed and does not * need to be particularly large. This is done because we must store the * caller's idea of the map size to properly unmap. */ struct vmmap { LIST_ENTRY(vmmap) vm_next; void *vm_addr; unsigned long vm_size; }; struct vmmaphd { struct vmmap *lh_first; }; #define VMMAP_HASH_SIZE 64 #define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1) #define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE]; static struct mtx vmmaplock; static void vmmap_add(void *addr, unsigned long size) { struct vmmap *vmmap; vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL); mtx_lock(&vmmaplock); vmmap->vm_size = size; vmmap->vm_addr = addr; LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next); mtx_unlock(&vmmaplock); } static struct vmmap * vmmap_remove(void *addr) { struct vmmap *vmmap; mtx_lock(&vmmaplock); LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next) if (vmmap->vm_addr == addr) break; if (vmmap) LIST_REMOVE(vmmap, vm_next); mtx_unlock(&vmmaplock); return (vmmap); } #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) void * _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr) { void *addr; addr = pmap_mapdev_attr(phys_addr, size, attr); if (addr == NULL) return (NULL); vmmap_add(addr, size); return (addr); } #endif void iounmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size); #endif kfree(vmmap); } void * vmap(struct page **pages, unsigned int count, unsigned long flags, int prot) { vm_offset_t off; size_t size; size = count * PAGE_SIZE; off = kva_alloc(size); if (off == 0) return (NULL); vmmap_add((void *)off, size); pmap_qenter(off, pages, count); return ((void *)off); } void vunmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE); kva_free((vm_offset_t)addr, vmmap->vm_size); kfree(vmmap); } char * kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = kmalloc(len + 1, gfp); if (p != NULL) vsnprintf(p, len + 1, fmt, ap); return (p); } char * kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return (p); } static void linux_timer_callback_wrapper(void *context) { struct timer_list *timer; linux_set_current(curthread); timer = context; timer->function(timer->data); } void mod_timer(struct timer_list *timer, int expires) { timer->expires = expires; callout_reset(&timer->callout, linux_timer_jiffies_until(expires), &linux_timer_callback_wrapper, timer); } void add_timer(struct timer_list *timer) { callout_reset(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer); } void add_timer_on(struct timer_list *timer, int cpu) { callout_reset_on(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer, cpu); } +int +del_timer(struct timer_list *timer) +{ + + if (callout_stop(&(timer)->callout) == -1) + return (0); + return (1); +} + static void linux_timer_init(void *arg) { /* * Compute an internal HZ value which can divide 2**32 to * avoid timer rounding problems when the tick value wraps * around 2**32: */ linux_timer_hz_mask = 1; while (linux_timer_hz_mask < (unsigned long)hz) linux_timer_hz_mask *= 2; linux_timer_hz_mask--; } SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL); void linux_complete_common(struct completion *c, int all) { int wakeup_swapper; sleepq_lock(c); if (all) { c->done = UINT_MAX; wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); } else { if (c->done != UINT_MAX) c->done++; wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); } sleepq_release(c); if (wakeup_swapper) kick_proc0(); } /* * Indefinite wait for done != 0 with or without signals. */ int linux_wait_for_common(struct completion *c, int flags) { struct task_struct *task; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; error = 0; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); if (flags & SLEEPQ_INTERRUPTIBLE) { DROP_GIANT(); error = -sleepq_wait_sig(c, 0); PICKUP_GIANT(); if (error != 0) { linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; goto intr; } } else { DROP_GIANT(); sleepq_wait(c, 0); PICKUP_GIANT(); } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); intr: return (error); } /* * Time limited wait for done != 0 with or without signals. */ int linux_wait_for_timeout_common(struct completion *c, int timeout, int flags) { struct task_struct *task; int end = jiffies + timeout; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); sleepq_set_timeout(c, linux_timer_jiffies_until(end)); DROP_GIANT(); if (flags & SLEEPQ_INTERRUPTIBLE) error = -sleepq_timedwait_sig(c, 0); else error = -sleepq_timedwait(c, 0); PICKUP_GIANT(); if (error != 0) { /* check for timeout */ if (error == -EWOULDBLOCK) { error = 0; /* timeout */ } else { /* signal happened */ linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; } goto done; } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); /* return how many jiffies are left */ error = linux_timer_jiffies_until(end); done: return (error); } int linux_try_wait_for_completion(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); if (c->done != 0 && c->done != UINT_MAX) c->done--; sleepq_release(c); return (isdone); } int linux_completion_done(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); sleepq_release(c); return (isdone); } static void linux_cdev_deref(struct linux_cdev *ldev) { if (refcount_release(&ldev->refs)) kfree(ldev); } static void linux_cdev_release(struct kobject *kobj) { struct linux_cdev *cdev; struct kobject *parent; cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; linux_destroy_dev(cdev); linux_cdev_deref(cdev); kobject_put(parent); } static void linux_cdev_static_release(struct kobject *kobj) { struct linux_cdev *cdev; struct kobject *parent; cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; linux_destroy_dev(cdev); kobject_put(parent); } void linux_destroy_dev(struct linux_cdev *ldev) { if (ldev->cdev == NULL) return; MPASS((ldev->siref & LDEV_SI_DTR) == 0); atomic_set_int(&ldev->siref, LDEV_SI_DTR); while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0) pause("ldevdtr", hz / 4); destroy_dev(ldev->cdev); ldev->cdev = NULL; } const struct kobj_type linux_cdev_ktype = { .release = linux_cdev_release, }; const struct kobj_type linux_cdev_static_ktype = { .release = linux_cdev_static_release, }; static void linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate) { struct notifier_block *nb; nb = arg; if (linkstate == LINK_STATE_UP) nb->notifier_call(nb, NETDEV_UP, ifp); else nb->notifier_call(nb, NETDEV_DOWN, ifp); } static void linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_REGISTER, ifp); } static void linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_UNREGISTER, ifp); } static void linux_handle_iflladdr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp); } static void linux_handle_ifaddr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; nb = arg; nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp); } int register_netdevice_notifier(struct notifier_block *nb) { nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( ifnet_link_event, linux_handle_ifnet_link_event, nb, 0); nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0); nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0); nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER( iflladdr_event, linux_handle_iflladdr_event, nb, 0); return (0); } int register_inetaddr_notifier(struct notifier_block *nb) { nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER( ifaddr_event, linux_handle_ifaddr_event, nb, 0); return (0); } int unregister_netdevice_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]); EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nb->tags[NETDEV_UNREGISTER]); EVENTHANDLER_DEREGISTER(iflladdr_event, nb->tags[NETDEV_CHANGEADDR]); return (0); } int unregister_inetaddr_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifaddr_event, nb->tags[NETDEV_CHANGEIFADDR]); return (0); } struct list_sort_thunk { int (*cmp)(void *, struct list_head *, struct list_head *); void *priv; }; static inline int linux_le_cmp(void *priv, const void *d1, const void *d2) { struct list_head *le1, *le2; struct list_sort_thunk *thunk; thunk = priv; le1 = *(__DECONST(struct list_head **, d1)); le2 = *(__DECONST(struct list_head **, d2)); return ((thunk->cmp)(thunk->priv, le1, le2)); } void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { struct list_sort_thunk thunk; struct list_head **ar, *le; size_t count, i; count = 0; list_for_each(le, head) count++; ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK); i = 0; list_for_each(le, head) ar[i++] = le; thunk.cmp = cmp; thunk.priv = priv; qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp); INIT_LIST_HEAD(head); for (i = 0; i < count; i++) list_add_tail(ar[i], head); free(ar, M_KMALLOC); } void linux_irq_handler(void *ent) { struct irq_ent *irqe; linux_set_current(curthread); irqe = ent; irqe->handler(irqe->irq, irqe->arg); } #if defined(__i386__) || defined(__amd64__) int linux_wbinvd_on_all_cpus(void) { pmap_invalidate_cache(); return (0); } #endif int linux_on_each_cpu(void callback(void *), void *data) { smp_rendezvous(smp_no_rendezvous_barrier, callback, smp_no_rendezvous_barrier, data); return (0); } int linux_in_atomic(void) { return ((curthread->td_pflags & TDP_NOFAULTING) != 0); } struct linux_cdev * linux_find_cdev(const char *name, unsigned major, unsigned minor) { dev_t dev = MKDEV(major, minor); struct cdev *cdev; dev_lock(); LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) { struct linux_cdev *ldev = cdev->si_drv1; if (ldev->dev == dev && strcmp(kobject_name(&ldev->kobj), name) == 0) { break; } } dev_unlock(); return (cdev != NULL ? cdev->si_drv1 : NULL); } int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add(cdev, makedev(major, i), 1); if (ret != 0) break; } return (ret); } int __register_chrdev_p(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops, uid_t uid, gid_t gid, int mode) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode); if (ret != 0) break; } return (ret); } void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct linux_cdev *cdevp; int i; for (i = baseminor; i < baseminor + count; i++) { cdevp = linux_find_cdev(name, major, i); if (cdevp != NULL) cdev_del(cdevp); } } void linux_dump_stack(void) { #ifdef STACK struct stack st; stack_zero(&st); stack_save(&st); stack_print(&st); #endif } #if defined(__i386__) || defined(__amd64__) bool linux_cpu_has_clflush; #endif static void linux_compat_init(void *arg) { struct sysctl_oid *rootoid; int i; #if defined(__i386__) || defined(__amd64__) linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); #endif rw_init(&linux_vma_lock, "lkpi-vma-lock"); rootoid = SYSCTL_ADD_ROOT_NODE(NULL, OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); kobject_init(&linux_class_root, &linux_class_ktype); kobject_set_name(&linux_class_root, "class"); linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); kobject_init(&linux_root_device.kobj, &linux_dev_ktype); kobject_set_name(&linux_root_device.kobj, "device"); linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL, "device"); linux_root_device.bsddev = root_bus; linux_class_misc.name = "misc"; class_register(&linux_class_misc); INIT_LIST_HEAD(&pci_drivers); INIT_LIST_HEAD(&pci_devices); spin_lock_init(&pci_lock); mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF); for (i = 0; i < VMMAP_HASH_SIZE; i++) LIST_INIT(&vmmaphead[i]); } SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); static void linux_compat_uninit(void *arg) { linux_kobject_kfree_name(&linux_class_root); linux_kobject_kfree_name(&linux_root_device.kobj); linux_kobject_kfree_name(&linux_class_misc.kobj); mtx_destroy(&vmmaplock); spin_lock_destroy(&pci_lock); rw_destroy(&linux_vma_lock); } SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL); /* * NOTE: Linux frequently uses "unsigned long" for pointer to integer * conversion and vice versa, where in FreeBSD "uintptr_t" would be * used. Assert these types have the same size, else some parts of the * LinuxKPI may not work like expected: */ CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t)); Index: projects/runtime-coverage-v2/sys/conf/kern.pre.mk =================================================================== --- projects/runtime-coverage-v2/sys/conf/kern.pre.mk (revision 347598) +++ projects/runtime-coverage-v2/sys/conf/kern.pre.mk (revision 347599) @@ -1,320 +1,320 @@ # $FreeBSD$ # Part of a unified Makefile for building kernels. This part contains all # of the definitions that need to be before %BEFORE_DEPEND. # Allow user to configure things that only effect src tree builds. # Note: This is duplicated from src.sys.mk to ensure that we include # /etc/src.conf when building the kernel. Kernels can be built without # the rest of /usr/src, but they still always process SRCCONF even though # the normal mechanisms to prevent that (compiling out of tree) won't # work. To ensure they do work, we have to duplicate thee few lines here. SRCCONF?= /etc/src.conf .if (exists(${SRCCONF}) || ${SRCCONF} != "/etc/src.conf") && !target(_srcconf_included_) .include "${SRCCONF}" _srcconf_included_: .endif .include .include .include "kern.opts.mk" # The kernel build always occurs in the object directory which is .CURDIR. .if ${.MAKE.MODE:Unormal:Mmeta} .MAKE.MODE+= curdirOk=yes .endif # The kernel build always expects .OBJDIR=.CURDIR. .OBJDIR: ${.CURDIR} .if defined(NO_OBJWALK) || ${MK_AUTO_OBJ} == "yes" NO_OBJWALK= t NO_MODULES_OBJ= t .endif .if !defined(NO_OBJWALK) _obj= obj .endif # Can be overridden by makeoptions or /etc/make.conf KERNEL_KO?= kernel KERNEL?= kernel KODIR?= /boot/${KERNEL} LDSCRIPT_NAME?= ldscript.$M LDSCRIPT?= $S/conf/${LDSCRIPT_NAME} M= ${MACHINE} AWK?= awk CP?= cp NM?= nm OBJCOPY?= objcopy SIZE?= size .if defined(DEBUG) _MINUS_O= -O CTFFLAGS+= -g .else .if ${MACHINE_CPUARCH} == "powerpc" _MINUS_O= -O # gcc miscompiles some code at -O2 .else _MINUS_O= -O2 .endif .endif .if ${MACHINE_CPUARCH} == "amd64" .if ${COMPILER_TYPE} == "clang" COPTFLAGS?=-O2 -pipe .else COPTFLAGS?=-O2 -frename-registers -pipe .endif .else COPTFLAGS?=${_MINUS_O} -pipe .endif .if !empty(COPTFLAGS:M-O[23s]) && empty(COPTFLAGS:M-fno-strict-aliasing) COPTFLAGS+= -fno-strict-aliasing .endif .if !defined(NO_CPU_COPTFLAGS) COPTFLAGS+= ${_CPUCFLAGS} .endif NOSTDINC= -nostdinc INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S -I$S/contrib/ck/include CFLAGS= ${COPTFLAGS} ${DEBUG} CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h CFLAGS_PARAM_INLINE_UNIT_GROWTH?=100 CFLAGS_PARAM_LARGE_FUNCTION_GROWTH?=1000 .if ${MACHINE_CPUARCH} == "mips" CFLAGS_ARCH_PARAMS?=--param max-inline-insns-single=1000 -DMACHINE_ARCH='"${MACHINE_ARCH}"' .endif CFLAGS.gcc+= -fno-common -fms-extensions -finline-limit=${INLINE_LIMIT} CFLAGS.gcc+= --param inline-unit-growth=${CFLAGS_PARAM_INLINE_UNIT_GROWTH} CFLAGS.gcc+= --param large-function-growth=${CFLAGS_PARAM_LARGE_FUNCTION_GROWTH} CFLAGS.gcc+= -fms-extensions .if defined(CFLAGS_ARCH_PARAMS) CFLAGS.gcc+=${CFLAGS_ARCH_PARAMS} .endif WERROR?= -Werror # XXX LOCORE means "don't declare C stuff" not "for locore.s". ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${CFLAGS} ${ASM_CFLAGS.${.IMPSRC:T}} .if defined(PROFLEVEL) && ${PROFLEVEL} >= 1 CFLAGS+= -DGPROF CFLAGS.gcc+= -falign-functions=16 .if ${PROFLEVEL} >= 2 CFLAGS+= -DGPROF4 -DGUPROF PROF= -pg .if ${COMPILER_TYPE} == "gcc" PROF+= -mprofiler-epilogue .endif .else PROF= -pg .endif .endif DEFINED_PROF= ${PROF} KUBSAN_ENABLED!= grep KUBSAN opt_global.h || true ; echo .if !empty(KUBSAN_ENABLED) SAN_CFLAGS+= -fsanitize=undefined .endif COVERAGE_ENABLED!= grep COVERAGE opt_global.h || true ; echo .if !empty(COVERAGE_ENABLED) .if ${COMPILER_TYPE} == "clang" || \ (${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 80100) SAN_CFLAGS+= -fsanitize-coverage=trace-pc,trace-cmp .else SAN_CFLAGS+= -fsanitize-coverage=trace-pc .endif .endif CFLAGS+= ${SAN_CFLAGS} GCOV_ENABLED!= grep GCOV opt_global.h || true ; echo .if !empty(GCOV_ENABLED) .if ${COMPILER_TYPE} == "gcc" GCOV_CFLAGS+= -fprofile-arcs -ftest-coverage .endif .endif CFLAGS+= ${GCOV_CFLAGS} # Put configuration-specific C flags last (except for ${PROF}) so that they # can override the others. CFLAGS+= ${CONF_CFLAGS} .if defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mbuild-id} -LDFLAGS+= -Wl,--build-id=sha1 +LDFLAGS+= --build-id=sha1 .endif .if (${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386") && \ defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mifunc} == "" .error amd64/arm64/i386 kernel requires linker ifunc support .endif .if ${MACHINE_CPUARCH} == "amd64" -LDFLAGS+= -Wl,-z max-page-size=2097152 +LDFLAGS+= -z max-page-size=2097152 .if ${LINKER_TYPE} != "lld" -LDFLAGS+= -Wl,-z common-page-size=4096 +LDFLAGS+= -z common-page-size=4096 .else -LDFLAGS+= -Wl,-z -Wl,ifunc-noplt +LDFLAGS+= -z notext -z ifunc-noplt .endif .endif NORMAL_C= ${CC} -c ${CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} NORMAL_S= ${CC:N${CCACHE_BIN}} -c ${ASM_CFLAGS} ${WERROR} ${.IMPSRC} PROFILE_C= ${CC} -c ${CFLAGS} ${WERROR} ${.IMPSRC} NORMAL_C_NOWERROR= ${CC} -c ${CFLAGS} ${PROF} ${.IMPSRC} NORMAL_M= ${AWK} -f $S/tools/makeobjops.awk ${.IMPSRC} -c ; \ ${CC} -c ${CFLAGS} ${WERROR} ${PROF} ${.PREFIX}.c NORMAL_FW= uudecode -o ${.TARGET} ${.ALLSRC} NORMAL_FWO= ${LD} -b binary --no-warn-mismatch -d -warn-common -r \ -m ${LD_EMULATION} -o ${.TARGET} ${.ALLSRC:M*.fw} # for ZSTD in the kernel (include zstd/lib/freebsd before other CFLAGS) ZSTD_C= ${CC} -c -DZSTD_HEAPMODE=1 -I$S/contrib/zstd/lib/freebsd ${CFLAGS} -I$S/contrib/zstd/lib -I$S/contrib/zstd/lib/common ${WERROR} -Wno-inline -Wno-missing-prototypes ${PROF} -U__BMI__ ${.IMPSRC} # Common for dtrace / zfs CDDL_CFLAGS= -DFREEBSD_NAMECACHE -nostdinc -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S -I$S/cddl/contrib/opensolaris/common ${CFLAGS} -Wno-unknown-pragmas -Wno-missing-prototypes -Wno-undef -Wno-strict-prototypes -Wno-cast-qual -Wno-parentheses -Wno-redundant-decls -Wno-missing-braces -Wno-uninitialized -Wno-unused -Wno-inline -Wno-switch -Wno-pointer-arith -Wno-unknown-pragmas CDDL_CFLAGS+= -include $S/cddl/compat/opensolaris/sys/debug_compat.h CDDL_C= ${CC} -c ${CDDL_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} # Special flags for managing the compat compiles for ZFS ZFS_CFLAGS= -DBUILDING_ZFS -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs/lua ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/zmod ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/zfs ZFS_CFLAGS+= ${CDDL_CFLAGS} ZFS_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${ZFS_CFLAGS} ZFS_C= ${CC} -c ${ZFS_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} ZFS_S= ${CC} -c ${ZFS_ASM_CFLAGS} ${WERROR} ${.IMPSRC} # Special flags for managing the compat compiles for DTrace DTRACE_CFLAGS= -DBUILDING_DTRACE ${CDDL_CFLAGS} -I$S/cddl/dev/dtrace -I$S/cddl/dev/dtrace/${MACHINE_CPUARCH} .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/intel -I$S/cddl/dev/dtrace/x86 .endif DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP DTRACE_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${DTRACE_CFLAGS} DTRACE_C= ${CC} -c ${DTRACE_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} DTRACE_S= ${CC} -c ${DTRACE_ASM_CFLAGS} ${WERROR} ${.IMPSRC} # Special flags for managing the compat compiles for DTrace/FBT FBT_CFLAGS= -DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S ${CDDL_CFLAGS} .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" FBT_CFLAGS+= -I$S/cddl/dev/fbt/x86 .endif FBT_C= ${CC} -c ${FBT_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} .if ${MK_CTF} != "no" NORMAL_CTFCONVERT= ${CTFCONVERT} ${CTFFLAGS} ${.TARGET} .elif ${MAKE_VERSION} >= 5201111300 NORMAL_CTFCONVERT= .else NORMAL_CTFCONVERT= @: .endif # Linux Kernel Programming Interface C-flags LINUXKPI_INCLUDES= -I$S/compat/linuxkpi/common/include LINUXKPI_C= ${NORMAL_C} ${LINUXKPI_INCLUDES} # Infiniband C flags. Correct include paths and omit errors that linux # does not honor. OFEDINCLUDES= -I$S/ofed/include -I$S/ofed/include/uapi ${LINUXKPI_INCLUDES} OFEDNOERR= -Wno-cast-qual -Wno-pointer-arith OFEDCFLAGS= ${CFLAGS:N-I*} -DCONFIG_INFINIBAND_USER_MEM \ ${OFEDINCLUDES} ${CFLAGS:M-I*} ${OFEDNOERR} OFED_C_NOIMP= ${CC} -c -o ${.TARGET} ${OFEDCFLAGS} ${WERROR} ${PROF} OFED_C= ${OFED_C_NOIMP} ${.IMPSRC} # mlxfw C flags. MLXFW_C= ${OFED_C_NOIMP} \ -I${SRCTOP}/sys/contrib/xz-embedded/freebsd \ -I${SRCTOP}/sys/contrib/xz-embedded/linux/lib/xz \ ${.IMPSRC} GEN_CFILES= $S/$M/$M/genassym.c ${MFILES:T:S/.m$/.c/} SYSTEM_CFILES= config.c env.c hints.c vnode_if.c SYSTEM_DEP= Makefile ${SYSTEM_OBJS} SYSTEM_OBJS= locore.o ${MDOBJS} ${OBJS} SYSTEM_OBJS+= ${SYSTEM_CFILES:.c=.o} SYSTEM_OBJS+= hack.pico MD_ROOT_SIZE_CONFIGURED!= grep MD_ROOT_SIZE opt_md.h || true ; echo .if ${MFS_IMAGE:Uno} != "no" .if empty(MD_ROOT_SIZE_CONFIGURED) SYSTEM_OBJS+= embedfs_${MFS_IMAGE:T:R}.o .endif .endif SYSTEM_LD= @${LD} -m ${LD_EMULATION} -Bdynamic -T ${LDSCRIPT} ${_LDFLAGS} \ --no-warn-mismatch --warn-common --export-dynamic \ --dynamic-linker /red/herring \ -o ${.TARGET} -X ${SYSTEM_OBJS} vers.o SYSTEM_LD_TAIL= @${OBJCOPY} --strip-symbol gcc2_compiled. ${.TARGET} ; \ ${SIZE} ${.TARGET} ; chmod 755 ${.TARGET} SYSTEM_DEP+= ${LDSCRIPT} # Calculate path for .m files early, if needed. .if !defined(NO_MODULES) && !defined(__MPATH) && !make(install) && \ (empty(.MAKEFLAGS:M-V) || defined(NO_SKIP_MPATH)) __MPATH!=find ${S:tA}/ -name \*_if.m .endif # MKMODULESENV is set here so that port makefiles can augment # them. MKMODULESENV+= MAKEOBJDIRPREFIX=${.OBJDIR}/modules KMODDIR=${KODIR} MKMODULESENV+= MACHINE_CPUARCH=${MACHINE_CPUARCH} MKMODULESENV+= MACHINE=${MACHINE} MACHINE_ARCH=${MACHINE_ARCH} MKMODULESENV+= MODULES_EXTRA="${MODULES_EXTRA}" WITHOUT_MODULES="${WITHOUT_MODULES}" MKMODULESENV+= ARCH_FLAGS="${ARCH_FLAGS}" .if (${KERN_IDENT} == LINT) MKMODULESENV+= ALL_MODULES=LINT .endif .if defined(MODULES_OVERRIDE) MKMODULESENV+= MODULES_OVERRIDE="${MODULES_OVERRIDE}" .endif .if defined(DEBUG) MKMODULESENV+= DEBUG_FLAGS="${DEBUG}" .endif .if !defined(NO_MODULES) MKMODULESENV+= __MPATH="${__MPATH}" .endif # Architecture and output format arguments for objcopy to convert image to # object file .if ${MFS_IMAGE:Uno} != "no" .if empty(MD_ROOT_SIZE_CONFIGURED) .if !defined(EMBEDFS_FORMAT.${MACHINE_ARCH}) EMBEDFS_FORMAT.${MACHINE_ARCH}!= awk -F'"' '/OUTPUT_FORMAT/ {print $$2}' ${LDSCRIPT} .if empty(EMBEDFS_FORMAT.${MACHINE_ARCH}) .undef EMBEDFS_FORMAT.${MACHINE_ARCH} .endif .endif .if !defined(EMBEDFS_ARCH.${MACHINE_ARCH}) EMBEDFS_ARCH.${MACHINE_ARCH}!= sed -n '/OUTPUT_ARCH/s/.*(\(.*\)).*/\1/p' ${LDSCRIPT} .if empty(EMBEDFS_ARCH.${MACHINE_ARCH}) .undef EMBEDFS_ARCH.${MACHINE_ARCH} .endif .endif EMBEDFS_FORMAT.arm?= elf32-littlearm EMBEDFS_FORMAT.armv6?= elf32-littlearm EMBEDFS_FORMAT.armv7?= elf32-littlearm EMBEDFS_FORMAT.aarch64?= elf64-littleaarch64 EMBEDFS_FORMAT.mips?= elf32-tradbigmips EMBEDFS_FORMAT.mipsel?= elf32-tradlittlemips EMBEDFS_FORMAT.mips64?= elf64-tradbigmips EMBEDFS_FORMAT.mips64el?= elf64-tradlittlemips EMBEDFS_FORMAT.riscv?= elf64-littleriscv .endif .endif # Detect kernel config options that force stack frames to be turned on. DDB_ENABLED!= grep DDB opt_ddb.h || true ; echo DTR_ENABLED!= grep KDTRACE_FRAME opt_kdtrace.h || true ; echo HWPMC_ENABLED!= grep HWPMC opt_hwpmc_hooks.h || true ; echo Index: projects/runtime-coverage-v2/sys/conf/kmod.mk =================================================================== --- projects/runtime-coverage-v2/sys/conf/kmod.mk (revision 347598) +++ projects/runtime-coverage-v2/sys/conf/kmod.mk (revision 347599) @@ -1,524 +1,524 @@ # From: @(#)bsd.prog.mk 5.26 (Berkeley) 6/25/91 # $FreeBSD$ # # The include file handles building and installing loadable # kernel modules. # # # +++ variables +++ # # CLEANFILES Additional files to remove for the clean and cleandir targets. # # EXPORT_SYMS A list of symbols that should be exported from the module, # or the name of a file containing a list of symbols, or YES # to export all symbols. If not defined, no symbols are # exported. # # KMOD The name of the kernel module to build. # # KMODDIR Base path for kernel modules (see kld(4)). [/boot/kernel] # # KMODOWN Module file owner. [${BINOWN}] # # KMODGRP Module file group. [${BINGRP}] # # KMODMODE Module file mode. [${BINMODE}] # # KMODLOAD Command to load a kernel module [/sbin/kldload] # # KMODUNLOAD Command to unload a kernel module [/sbin/kldunload] # # KMODISLOADED Command to check whether a kernel module is # loaded [/sbin/kldstat -q -n] # # PROG The name of the kernel module to build. # If not supplied, ${KMOD}.ko is used. # # SRCS List of source files. # # FIRMWS List of firmware images in format filename:shortname:version # # FIRMWARE_LICENSE # Set to the name of the license the user has to agree on in # order to use this firmware. See /usr/share/doc/legal # # DESTDIR The tree where the module gets installed. [not set] # # KERNBUILDDIR # Set to the location of the kernel build directory where # the opt_*.h files, .o's and kernel winds up. # # +++ targets +++ # # install: # install the kernel module; if the Makefile # does not itself define the target install, the targets # beforeinstall and afterinstall may also be used to cause # actions immediately before and after the install target # is executed. # # load: # Load a module. # # unload: # Unload a module. # # reload: # Unload if loaded, then load. # AWK?= awk KMODLOAD?= /sbin/kldload KMODUNLOAD?= /sbin/kldunload KMODISLOADED?= /sbin/kldstat -q -n OBJCOPY?= objcopy .include # Grab all the options for a kernel build. For backwards compat, we need to # do this after bsd.own.mk. .include "kern.opts.mk" .include .include "config.mk" # Search for kernel source tree in standard places. .if empty(KERNBUILDDIR) .if !defined(SYSDIR) .for _dir in ${SRCTOP:D${SRCTOP}/sys} \ ${.CURDIR}/../.. ${.CURDIR}/../../.. /sys /usr/src/sys .if !defined(SYSDIR) && exists(${_dir}/kern/) SYSDIR= ${_dir:tA} .endif .endfor .endif .if !defined(SYSDIR) || !exists(${SYSDIR}/kern/) .error "can't find kernel source tree" .endif .endif .SUFFIXES: .out .o .c .cc .cxx .C .y .l .s .S .m # amd64 and mips use direct linking for kmod, all others use shared binaries .if ${MACHINE_CPUARCH} != amd64 && ${MACHINE_CPUARCH} != mips __KLD_SHARED=yes .else __KLD_SHARED=no .endif .if !empty(CFLAGS:M-O[23s]) && empty(CFLAGS:M-fno-strict-aliasing) CFLAGS+= -fno-strict-aliasing .endif WERROR?= -Werror CFLAGS+= ${WERROR} CFLAGS+= -D_KERNEL CFLAGS+= -DKLD_MODULE .if defined(MODULE_TIED) CFLAGS+= -DKLD_TIED .endif # Don't use any standard or source-relative include directories. NOSTDINC= -nostdinc CFLAGS:= ${CFLAGS:N-I*} ${NOSTDINC} ${INCLMAGIC} ${CFLAGS:M-I*} .if defined(KERNBUILDDIR) CFLAGS+= -DHAVE_KERNEL_OPTION_HEADERS -include ${KERNBUILDDIR}/opt_global.h .endif # Add -I paths for system headers. Individual module makefiles don't # need any -I paths for this. Similar defaults for .PATH can't be # set because there are no standard paths for non-headers. CFLAGS+= -I. -I${SYSDIR} -I${SYSDIR}/contrib/ck/include CFLAGS.gcc+= -finline-limit=${INLINE_LIMIT} CFLAGS.gcc+= -fms-extensions CFLAGS.gcc+= --param inline-unit-growth=100 CFLAGS.gcc+= --param large-function-growth=1000 # Disallow common variables, and if we end up with commons from # somewhere unexpected, allocate storage for them in the module itself. CFLAGS+= -fno-common LDFLAGS+= -d -warn-common .if defined(LINKER_FEATURES) && ${LINKER_FEATURES:Mbuild-id} -LDFLAGS+= -Wl,--build-id=sha1 +LDFLAGS+= --build-id=sha1 .endif CFLAGS+= ${DEBUG_FLAGS} .if ${MACHINE_CPUARCH} == amd64 CFLAGS+= -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "riscv" CFLAGS+= -fPIC .endif # Temporary workaround for PR 196407, which contains the fascinating details. # Don't allow clang to use fpu instructions or registers in kernel modules. .if ${MACHINE_CPUARCH} == arm .if ${COMPILER_VERSION} < 30800 CFLAGS.clang+= -mllvm -arm-use-movt=0 .else CFLAGS.clang+= -mno-movt .endif CFLAGS.clang+= -mfpu=none CFLAGS+= -funwind-tables .endif .if ${MACHINE_CPUARCH} == powerpc CFLAGS+= -mlongcall -fno-omit-frame-pointer .endif .if ${MACHINE_CPUARCH} == mips CFLAGS+= -G0 -fno-pic -mno-abicalls -mlong-calls .endif .if defined(DEBUG) || defined(DEBUG_FLAGS) CTFFLAGS+= -g .endif .if defined(FIRMWS) ${KMOD:S/$/.c/}: ${SYSDIR}/tools/fw_stub.awk ${AWK} -f ${SYSDIR}/tools/fw_stub.awk ${FIRMWS} -m${KMOD} -c${KMOD:S/$/.c/g} \ ${FIRMWARE_LICENSE:C/.+/-l/}${FIRMWARE_LICENSE} SRCS+= ${KMOD:S/$/.c/} CLEANFILES+= ${KMOD:S/$/.c/} .for _firmw in ${FIRMWS} ${_firmw:C/\:.*$/.fwo/:T}: ${_firmw:C/\:.*$//} @${ECHO} ${_firmw:C/\:.*$//} ${.ALLSRC:M*${_firmw:C/\:.*$//}} @if [ -e ${_firmw:C/\:.*$//} ]; then \ ${LD} -b binary --no-warn-mismatch ${_LDFLAGS} \ -m ${LD_EMULATION} -r -d \ -o ${.TARGET} ${_firmw:C/\:.*$//}; \ else \ ln -s ${.ALLSRC:M*${_firmw:C/\:.*$//}} ${_firmw:C/\:.*$//}; \ ${LD} -b binary --no-warn-mismatch ${_LDFLAGS} \ -m ${LD_EMULATION} -r -d \ -o ${.TARGET} ${_firmw:C/\:.*$//}; \ rm ${_firmw:C/\:.*$//}; \ fi OBJS+= ${_firmw:C/\:.*$/.fwo/:T} .endfor .endif # Conditionally include SRCS based on kernel config options. .for _o in ${KERN_OPTS} SRCS+=${SRCS.${_o}} .endfor OBJS+= ${SRCS:N*.h:R:S/$/.o/g} .if !defined(PROG) PROG= ${KMOD}.ko .endif .if !defined(DEBUG_FLAGS) FULLPROG= ${PROG} .else FULLPROG= ${PROG}.full ${PROG}: ${FULLPROG} ${PROG}.debug ${OBJCOPY} --strip-debug --add-gnu-debuglink=${PROG}.debug \ ${FULLPROG} ${.TARGET} ${PROG}.debug: ${FULLPROG} ${OBJCOPY} --only-keep-debug ${FULLPROG} ${.TARGET} .endif .if ${__KLD_SHARED} == yes ${FULLPROG}: ${KMOD}.kld ${LD} -m ${LD_EMULATION} -Bshareable -znotext ${_LDFLAGS} \ -o ${.TARGET} ${KMOD}.kld .if !defined(DEBUG_FLAGS) ${OBJCOPY} --strip-debug ${.TARGET} .endif .endif EXPORT_SYMS?= NO .if ${EXPORT_SYMS} != YES CLEANFILES+= export_syms .endif .if ${__KLD_SHARED} == yes ${KMOD}.kld: ${OBJS} .else ${FULLPROG}: ${OBJS} .endif ${LD} -m ${LD_EMULATION} ${_LDFLAGS} -r -d -o ${.TARGET} ${OBJS} .if ${MK_CTF} != "no" ${CTFMERGE} ${CTFFLAGS} -o ${.TARGET} ${OBJS} .endif .if defined(EXPORT_SYMS) .if ${EXPORT_SYMS} != YES .if ${EXPORT_SYMS} == NO :> export_syms .elif !exists(${.CURDIR}/${EXPORT_SYMS}) echo -n "${EXPORT_SYMS:@s@$s${.newline}@}" > export_syms .else grep -v '^#' < ${EXPORT_SYMS} > export_syms .endif ${AWK} -f ${SYSDIR}/conf/kmod_syms.awk ${.TARGET} \ export_syms | xargs -J% ${OBJCOPY} % ${.TARGET} .endif .endif # defined(EXPORT_SYMS) .if defined(PREFIX_SYMS) ${AWK} -v prefix=${PREFIX_SYMS} -f ${SYSDIR}/conf/kmod_syms_prefix.awk \ ${.TARGET} /dev/null | xargs -J% ${OBJCOPY} % ${.TARGET} .endif .if !defined(DEBUG_FLAGS) && ${__KLD_SHARED} == no ${OBJCOPY} --strip-debug ${.TARGET} .endif .if ${COMPILER_TYPE} == "clang" || \ (${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 60000) _MAP_DEBUG_PREFIX= yes .endif _ILINKS=machine .if ${MACHINE} != ${MACHINE_CPUARCH} && ${MACHINE} != "arm64" _ILINKS+=${MACHINE_CPUARCH} .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _ILINKS+=x86 .endif CLEANFILES+=${_ILINKS} all: ${PROG} beforedepend: ${_ILINKS} beforebuild: ${_ILINKS} # Ensure that the links exist without depending on it when it exists which # causes all the modules to be rebuilt when the directory pointed to changes. # Ensure that debug info references the path in the source tree. .for _link in ${_ILINKS} .if !exists(${.OBJDIR}/${_link}) OBJS_DEPEND_GUESS+= ${_link} .endif .if defined(_MAP_DEBUG_PREFIX) .if ${_link} == "machine" CFLAGS+= -fdebug-prefix-map=./machine=${SYSDIR}/${MACHINE}/include .else CFLAGS+= -fdebug-prefix-map=./${_link}=${SYSDIR}/${_link}/include .endif .endif .endfor .NOPATH: ${_ILINKS} ${_ILINKS}: @case ${.TARGET} in \ machine) \ path=${SYSDIR}/${MACHINE}/include ;; \ *) \ path=${SYSDIR}/${.TARGET:T}/include ;; \ esac ; \ path=`(cd $$path && /bin/pwd)` ; \ ${ECHO} ${.TARGET:T} "->" $$path ; \ ln -fns $$path ${.TARGET:T} CLEANFILES+= ${PROG} ${KMOD}.kld ${OBJS} .if defined(DEBUG_FLAGS) CLEANFILES+= ${FULLPROG} ${PROG}.debug .endif .if !target(install) _INSTALLFLAGS:= ${INSTALLFLAGS} .for ie in ${INSTALLFLAGS_EDIT} _INSTALLFLAGS:= ${_INSTALLFLAGS${ie}} .endfor .if !target(realinstall) KERN_DEBUGDIR?= ${DEBUGDIR} realinstall: _kmodinstall .ORDER: beforeinstall _kmodinstall _kmodinstall: .PHONY ${INSTALL} -T release -o ${KMODOWN} -g ${KMODGRP} -m ${KMODMODE} \ ${_INSTALLFLAGS} ${PROG} ${DESTDIR}${KMODDIR}/ .if defined(DEBUG_FLAGS) && !defined(INSTALL_NODEBUG) && ${MK_KERNEL_SYMBOLS} != "no" ${INSTALL} -T debug -o ${KMODOWN} -g ${KMODGRP} -m ${KMODMODE} \ ${_INSTALLFLAGS} ${PROG}.debug ${DESTDIR}${KERN_DEBUGDIR}${KMODDIR}/ .endif .include .if !defined(NO_XREF) afterinstall: _kldxref .ORDER: realinstall _kldxref .ORDER: _installlinks _kldxref _kldxref: .PHONY @if type kldxref >/dev/null 2>&1; then \ ${ECHO} kldxref ${DESTDIR}${KMODDIR}; \ kldxref ${DESTDIR}${KMODDIR}; \ fi .endif .endif # !target(realinstall) .endif # !target(install) .if !target(load) load: ${PROG} .PHONY ${KMODLOAD} -v ${.OBJDIR}/${PROG} .endif .if !target(unload) unload: .PHONY if ${KMODISLOADED} ${PROG} ; then ${KMODUNLOAD} -v ${PROG} ; fi .endif .if !target(reload) reload: unload load .PHONY .endif .if defined(KERNBUILDDIR) .PATH: ${KERNBUILDDIR} CFLAGS+= -I${KERNBUILDDIR} .for _src in ${SRCS:Mopt_*.h} CLEANFILES+= ${_src} .if !target(${_src}) ${_src}: ln -sf ${KERNBUILDDIR}/${_src} ${.TARGET} .endif .endfor .else .for _src in ${SRCS:Mopt_*.h} CLEANFILES+= ${_src} .if !target(${_src}) ${_src}: :> ${.TARGET} .endif .endfor .endif # Add the sanitizer C flags CFLAGS+= ${SAN_CFLAGS} # Add the gcov flags CFLAGS+= ${GCOV_CFLAGS} # Respect configuration-specific C flags. CFLAGS+= ${ARCH_FLAGS} ${CONF_CFLAGS} .if !empty(SRCS:Mvnode_if.c) CLEANFILES+= vnode_if.c vnode_if.c: ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src ${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -c .endif .if !empty(SRCS:Mvnode_if.h) CLEANFILES+= vnode_if.h vnode_if_newproto.h vnode_if_typedef.h vnode_if.h vnode_if_newproto.h vnode_if_typedef.h: ${SYSDIR}/tools/vnode_if.awk \ ${SYSDIR}/kern/vnode_if.src vnode_if.h: vnode_if_newproto.h vnode_if_typedef.h ${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -h vnode_if_newproto.h: ${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -p vnode_if_typedef.h: ${AWK} -f ${SYSDIR}/tools/vnode_if.awk ${SYSDIR}/kern/vnode_if.src -q .endif # Build _if.[ch] from _if.m, and clean them when we're done. # __MPATH defined in config.mk _MFILES=${__MPATH:T:O} _MPATH=${__MPATH:H:O:u} .PATH.m: ${_MPATH} .for _i in ${SRCS:M*_if.[ch]} _MATCH=M${_i:R:S/$/.m/} _MATCHES=${_MFILES:${_MATCH}} .if !empty(_MATCHES) CLEANFILES+= ${_i} .endif .endfor # _i .m.c: ${SYSDIR}/tools/makeobjops.awk ${AWK} -f ${SYSDIR}/tools/makeobjops.awk ${.IMPSRC} -c .m.h: ${SYSDIR}/tools/makeobjops.awk ${AWK} -f ${SYSDIR}/tools/makeobjops.awk ${.IMPSRC} -h .for _i in mii pccard .if !empty(SRCS:M${_i}devs.h) CLEANFILES+= ${_i}devs.h ${_i}devs.h: ${SYSDIR}/tools/${_i}devs2h.awk ${SYSDIR}/dev/${_i}/${_i}devs ${AWK} -f ${SYSDIR}/tools/${_i}devs2h.awk ${SYSDIR}/dev/${_i}/${_i}devs .endif .endfor # _i .if !empty(SRCS:Mbhnd_nvram_map.h) CLEANFILES+= bhnd_nvram_map.h bhnd_nvram_map.h: ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.awk \ ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \ ${SYSDIR}/dev/bhnd/nvram/nvram_map bhnd_nvram_map.h: sh ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \ ${SYSDIR}/dev/bhnd/nvram/nvram_map -h .endif .if !empty(SRCS:Mbhnd_nvram_map_data.h) CLEANFILES+= bhnd_nvram_map_data.h bhnd_nvram_map_data.h: ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.awk \ ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \ ${SYSDIR}/dev/bhnd/nvram/nvram_map bhnd_nvram_map_data.h: sh ${SYSDIR}/dev/bhnd/tools/nvram_map_gen.sh \ ${SYSDIR}/dev/bhnd/nvram/nvram_map -d .endif .if !empty(SRCS:Musbdevs.h) CLEANFILES+= usbdevs.h usbdevs.h: ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs ${AWK} -f ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs -h .endif .if !empty(SRCS:Musbdevs_data.h) CLEANFILES+= usbdevs_data.h usbdevs_data.h: ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs ${AWK} -f ${SYSDIR}/tools/usbdevs2h.awk ${SYSDIR}/dev/usb/usbdevs -d .endif .if !empty(SRCS:Macpi_quirks.h) CLEANFILES+= acpi_quirks.h acpi_quirks.h: ${SYSDIR}/tools/acpi_quirks2h.awk ${SYSDIR}/dev/acpica/acpi_quirks ${AWK} -f ${SYSDIR}/tools/acpi_quirks2h.awk ${SYSDIR}/dev/acpica/acpi_quirks .endif .if !empty(SRCS:Massym.inc) || !empty(DPSRCS:Massym.inc) CLEANFILES+= assym.inc DEPENDOBJS+= genassym.o DPSRCS+= offset.inc .endif .if defined(MODULE_TIED) DPSRCS+= offset.inc .endif .if !empty(SRCS:Moffset.inc) || !empty(DPSRCS:Moffset.inc) CLEANFILES+= offset.inc genoffset.o DEPENDOBJS+= genoffset.o .endif assym.inc: genassym.o offset.inc: genoffset.o assym.inc: ${SYSDIR}/kern/genassym.sh sh ${SYSDIR}/kern/genassym.sh genassym.o > ${.TARGET} genassym.o: ${SYSDIR}/${MACHINE}/${MACHINE}/genassym.c offset.inc genassym.o: ${SRCS:Mopt_*.h} ${CC} -c ${CFLAGS:N-flto:N-fno-common} \ ${SYSDIR}/${MACHINE}/${MACHINE}/genassym.c offset.inc: ${SYSDIR}/kern/genoffset.sh genoffset.o sh ${SYSDIR}/kern/genoffset.sh genoffset.o > ${.TARGET} genoffset.o: ${SYSDIR}/kern/genoffset.c genoffset.o: ${SRCS:Mopt_*.h} ${CC} -c ${CFLAGS:N-flto:N-fno-common} \ ${SYSDIR}/kern/genoffset.c CLEANDEPENDFILES+= ${_ILINKS} # .depend needs include links so we remove them only together. cleanilinks: rm -f ${_ILINKS} OBJS_DEPEND_GUESS+= ${SRCS:M*.h} .if defined(KERNBUILDDIR) OBJS_DEPEND_GUESS+= opt_global.h .endif .include .include .include .include "kern.mk" Index: projects/runtime-coverage-v2/sys/dev/cpuctl/cpuctl.c =================================================================== --- projects/runtime-coverage-v2/sys/dev/cpuctl/cpuctl.c (revision 347598) +++ projects/runtime-coverage-v2/sys/dev/cpuctl/cpuctl.c (revision 347599) @@ -1,597 +1,598 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006-2008 Stanislav Sedov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static d_open_t cpuctl_open; static d_ioctl_t cpuctl_ioctl; #define CPUCTL_VERSION 1 #ifdef CPUCTL_DEBUG # define DPRINTF(format,...) printf(format, __VA_ARGS__); #else # define DPRINTF(...) #endif #define UCODE_SIZE_MAX (4 * 1024 * 1024) static int cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd, struct thread *td); static int cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data, struct thread *td); static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data, struct thread *td); static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td); static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td); static int update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td); static int update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td); static int update_via(int cpu, cpuctl_update_args_t *args, struct thread *td); static struct cdev **cpuctl_devs; static MALLOC_DEFINE(M_CPUCTL, "cpuctl", "CPUCTL buffer"); static struct cdevsw cpuctl_cdevsw = { .d_version = D_VERSION, .d_open = cpuctl_open, .d_ioctl = cpuctl_ioctl, .d_name = "cpuctl", }; /* * This function checks if specified cpu enabled or not. */ static int cpu_enabled(int cpu) { return (pmc_cpu_is_disabled(cpu) == 0); } /* * Check if the current thread is bound to a specific cpu. */ static int cpu_sched_is_bound(struct thread *td) { int ret; thread_lock(td); ret = sched_is_bound(td); thread_unlock(td); return (ret); } /* * Switch to target cpu to run. */ static void set_cpu(int cpu, struct thread *td) { KASSERT(cpu >= 0 && cpu <= mp_maxid && cpu_enabled(cpu), ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); thread_lock(td); sched_bind(td, cpu); thread_unlock(td); KASSERT(td->td_oncpu == cpu, ("[cpuctl,%d]: cannot bind to target cpu %d on cpu %d", __LINE__, cpu, td->td_oncpu)); } static void restore_cpu(int oldcpu, int is_bound, struct thread *td) { KASSERT(oldcpu >= 0 && oldcpu <= mp_maxid && cpu_enabled(oldcpu), ("[cpuctl,%d]: bad cpu number %d", __LINE__, oldcpu)); thread_lock(td); if (is_bound == 0) sched_unbind(td); else sched_bind(td, oldcpu); thread_unlock(td); } int cpuctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags, struct thread *td) { int cpu, ret; cpu = dev2unit(dev); if (cpu > mp_maxid || !cpu_enabled(cpu)) { DPRINTF("[cpuctl,%d]: bad cpu number %d\n", __LINE__, cpu); return (ENXIO); } /* Require write flag for "write" requests. */ if ((cmd == CPUCTL_MSRCBIT || cmd == CPUCTL_MSRSBIT || cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR || cmd == CPUCTL_EVAL_CPU_FEATURES) && (flags & FWRITE) == 0) return (EPERM); switch (cmd) { case CPUCTL_RDMSR: ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data, cmd, td); break; case CPUCTL_MSRSBIT: case CPUCTL_MSRCBIT: case CPUCTL_WRMSR: ret = priv_check(td, PRIV_CPUCTL_WRMSR); if (ret != 0) goto fail; ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data, cmd, td); break; case CPUCTL_CPUID: ret = cpuctl_do_cpuid(cpu, (cpuctl_cpuid_args_t *)data, td); break; case CPUCTL_UPDATE: ret = priv_check(td, PRIV_CPUCTL_UPDATE); if (ret != 0) goto fail; ret = cpuctl_do_update(cpu, (cpuctl_update_args_t *)data, td); break; case CPUCTL_CPUID_COUNT: ret = cpuctl_do_cpuid_count(cpu, (cpuctl_cpuid_count_args_t *)data, td); break; case CPUCTL_EVAL_CPU_FEATURES: ret = cpuctl_do_eval_cpu_features(cpu, td); break; default: ret = EINVAL; break; } fail: return (ret); } /* * Actually perform cpuid operation. */ static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data, struct thread *td) { int is_bound = 0; int oldcpu; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); /* Explicitly clear cpuid data to avoid returning stale info. */ bzero(data->data, sizeof(data->data)); DPRINTF("[cpuctl,%d]: retrieving cpuid lev %#0x type %#0x for %d cpu\n", __LINE__, data->level, data->level_type, cpu); #ifdef __i386__ if (cpu_id == 0) return (ENODEV); #endif oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); cpuid_count(data->level, data->level_type, data->data); restore_cpu(oldcpu, is_bound, td); return (0); } static int cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data, struct thread *td) { cpuctl_cpuid_count_args_t cdata; int error; cdata.level = data->level; /* Override the level type. */ cdata.level_type = 0; error = cpuctl_do_cpuid_count(cpu, &cdata, td); bcopy(cdata.data, data->data, sizeof(data->data)); /* Ignore error */ return (error); } /* * Actually perform MSR operations. */ static int cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd, struct thread *td) { uint64_t reg; int is_bound = 0; int oldcpu; int ret; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); /* * Explicitly clear cpuid data to avoid returning stale * info */ DPRINTF("[cpuctl,%d]: operating on MSR %#0x for %d cpu\n", __LINE__, data->msr, cpu); #ifdef __i386__ if ((cpu_feature & CPUID_MSR) == 0) return (ENODEV); #endif oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); if (cmd == CPUCTL_RDMSR) { data->data = 0; ret = rdmsr_safe(data->msr, &data->data); } else if (cmd == CPUCTL_WRMSR) { ret = wrmsr_safe(data->msr, data->data); } else if (cmd == CPUCTL_MSRSBIT) { critical_enter(); ret = rdmsr_safe(data->msr, ®); if (ret == 0) ret = wrmsr_safe(data->msr, reg | data->data); critical_exit(); } else if (cmd == CPUCTL_MSRCBIT) { critical_enter(); ret = rdmsr_safe(data->msr, ®); if (ret == 0) ret = wrmsr_safe(data->msr, reg & ~data->data); critical_exit(); } else panic("[cpuctl,%d]: unknown operation requested: %lu", __LINE__, cmd); restore_cpu(oldcpu, is_bound, td); return (ret); } /* * Actually perform microcode update. */ static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td) { cpuctl_cpuid_args_t args = { .level = 0, }; char vendor[13]; int ret; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); DPRINTF("[cpuctl,%d]: XXX %d", __LINE__, cpu); ret = cpuctl_do_cpuid(cpu, &args, td); if (ret != 0) return (ret); ((uint32_t *)vendor)[0] = args.data[1]; ((uint32_t *)vendor)[1] = args.data[3]; ((uint32_t *)vendor)[2] = args.data[2]; vendor[12] = '\0'; if (strncmp(vendor, INTEL_VENDOR_ID, sizeof(INTEL_VENDOR_ID)) == 0) ret = update_intel(cpu, data, td); else if(strncmp(vendor, AMD_VENDOR_ID, sizeof(AMD_VENDOR_ID)) == 0) ret = update_amd(cpu, data, td); else if(strncmp(vendor, CENTAUR_VENDOR_ID, sizeof(CENTAUR_VENDOR_ID)) == 0) ret = update_via(cpu, data, td); else ret = ENXIO; return (ret); } struct ucode_update_data { void *ptr; int cpu; int ret; }; static void ucode_intel_load_rv(void *arg) { struct ucode_update_data *d; d = arg; if (PCPU_GET(cpuid) == d->cpu) d->ret = ucode_intel_load(d->ptr, true, NULL, NULL); } static int update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td) { struct ucode_update_data d; void *ptr; int is_bound, oldcpu, ret; if (args->size == 0 || args->data == NULL) { DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__); return (EINVAL); } if (args->size > UCODE_SIZE_MAX) { DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__); return (EINVAL); } /* * 16 byte alignment required. Rely on the fact that * malloc(9) always returns the pointer aligned at least on * the size of the allocation. */ ptr = malloc(args->size + 16, M_CPUCTL, M_WAITOK); if (copyin(args->data, ptr, args->size) != 0) { DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed", __LINE__, args->data, ptr, args->size); ret = EFAULT; goto out; } oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); d.ptr = ptr; d.cpu = cpu; smp_rendezvous(NULL, ucode_intel_load_rv, NULL, &d); restore_cpu(oldcpu, is_bound, td); ret = d.ret; /* * Replace any existing update. This ensures that the new update * will be reloaded automatically during ACPI resume. */ if (ret == 0) ptr = ucode_update(ptr); out: free(ptr, M_CPUCTL); return (ret); } /* * NB: MSR 0xc0010020, MSR_K8_UCODE_UPDATE, is not documented by AMD. * Coreboot, illumos and Linux source code was used to understand * its workings. */ static void amd_ucode_wrmsr(void *ucode_ptr) { uint32_t tmp[4]; wrmsr_safe(MSR_K8_UCODE_UPDATE, (uintptr_t)ucode_ptr); do_cpuid(0, tmp); } static int update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td) { void *ptr; int ret; if (args->size == 0 || args->data == NULL) { DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__); return (EINVAL); } if (args->size > UCODE_SIZE_MAX) { DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__); return (EINVAL); } /* * 16 byte alignment required. Rely on the fact that * malloc(9) always returns the pointer aligned at least on * the size of the allocation. */ ptr = malloc(args->size + 16, M_CPUCTL, M_ZERO | M_WAITOK); if (copyin(args->data, ptr, args->size) != 0) { DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed", __LINE__, args->data, ptr, args->size); ret = EFAULT; goto fail; } smp_rendezvous(NULL, amd_ucode_wrmsr, NULL, ptr); ret = 0; fail: free(ptr, M_CPUCTL); return (ret); } static int update_via(int cpu, cpuctl_update_args_t *args, struct thread *td) { void *ptr; uint64_t rev0, rev1, res; uint32_t tmp[4]; int is_bound; int oldcpu; int ret; if (args->size == 0 || args->data == NULL) { DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__); return (EINVAL); } if (args->size > UCODE_SIZE_MAX) { DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__); return (EINVAL); } /* * 4 byte alignment required. */ ptr = malloc(args->size, M_CPUCTL, M_WAITOK); if (copyin(args->data, ptr, args->size) != 0) { DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed", __LINE__, args->data, ptr, args->size); ret = EFAULT; goto fail; } oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); critical_enter(); rdmsr_safe(MSR_BIOS_SIGN, &rev0); /* Get current microcode revision. */ /* * Perform update. */ wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uintptr_t)(ptr)); do_cpuid(1, tmp); /* * Result are in low byte of MSR FCR5: * 0x00: No update has been attempted since RESET. * 0x01: The last attempted update was successful. * 0x02: The last attempted update was unsuccessful due to a bad * environment. No update was loaded and any preexisting * patches are still active. * 0x03: The last attempted update was not applicable to this processor. * No update was loaded and any preexisting patches are still * active. * 0x04: The last attempted update was not successful due to an invalid * update data block. No update was loaded and any preexisting * patches are still active */ rdmsr_safe(0x1205, &res); res &= 0xff; critical_exit(); rdmsr_safe(MSR_BIOS_SIGN, &rev1); /* Get new microcode revision. */ restore_cpu(oldcpu, is_bound, td); DPRINTF("[cpu,%d]: rev0=%x rev1=%x res=%x\n", __LINE__, (unsigned)(rev0 >> 32), (unsigned)(rev1 >> 32), (unsigned)res); if (res != 0x01) ret = EINVAL; else ret = 0; fail: free(ptr, M_CPUCTL); return (ret); } static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td) { int is_bound = 0; int oldcpu; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); #ifdef __i386__ if (cpu_id == 0) return (ENODEV); #endif oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); identify_cpu1(); identify_cpu2(); hw_ibrs_recalculate(); restore_cpu(oldcpu, is_bound, td); hw_ssb_recalculate(true); #ifdef __amd64__ amd64_syscall_ret_flush_l1d_recalc(); #endif + hw_mds_recalculate(); printcpuinfo(); return (0); } int cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td) { int ret = 0; int cpu; cpu = dev2unit(dev); if (cpu > mp_maxid || !cpu_enabled(cpu)) { DPRINTF("[cpuctl,%d]: incorrect cpu number %d\n", __LINE__, cpu); return (ENXIO); } if (flags & FWRITE) ret = securelevel_gt(td->td_ucred, 0); return (ret); } static int cpuctl_modevent(module_t mod __unused, int type, void *data __unused) { int cpu; switch(type) { case MOD_LOAD: if (bootverbose) printf("cpuctl: access to MSR registers/cpuid info.\n"); cpuctl_devs = malloc(sizeof(*cpuctl_devs) * (mp_maxid + 1), M_CPUCTL, M_WAITOK | M_ZERO); CPU_FOREACH(cpu) if (cpu_enabled(cpu)) cpuctl_devs[cpu] = make_dev(&cpuctl_cdevsw, cpu, UID_ROOT, GID_KMEM, 0640, "cpuctl%d", cpu); break; case MOD_UNLOAD: CPU_FOREACH(cpu) { if (cpuctl_devs[cpu] != NULL) destroy_dev(cpuctl_devs[cpu]); } free(cpuctl_devs, M_CPUCTL); break; case MOD_SHUTDOWN: break; default: return (EOPNOTSUPP); } return (0); } DEV_MODULE(cpuctl, cpuctl_modevent, NULL); MODULE_VERSION(cpuctl, CPUCTL_VERSION); Index: projects/runtime-coverage-v2/sys/i386/i386/exception.s =================================================================== --- projects/runtime-coverage-v2/sys/i386/i386/exception.s (revision 347598) +++ projects/runtime-coverage-v2/sys/i386/i386/exception.s (revision 347599) @@ -1,659 +1,661 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * Copyright (c) 2007, 2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_apic.h" #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" #include "assym.inc" #include #include #include #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr .align 4 .type dtrace_invop_jump_addr, @object .size dtrace_invop_jump_addr, 4 dtrace_invop_jump_addr: .zero 4 .globl dtrace_invop_calltrap_addr .align 4 .type dtrace_invop_calltrap_addr, @object .size dtrace_invop_calltrap_addr, 4 dtrace_invop_calltrap_addr: .zero 8 #endif .text ENTRY(start_exceptions) .globl tramp_idleptd tramp_idleptd: .long 0 /*****************************************************************************/ /* Trap handling */ /*****************************************************************************/ /* * Trap and fault vector routines. * * All traps are 'interrupt gates', SDT_SYS386IGT. Interrupts are disabled * by hardware to not allow interrupts until code switched to the kernel * address space and the kernel thread stack. * * The cpu will push a certain amount of state onto the kernel stack for * the current process. The amount of state depends on the type of trap * and whether the trap crossed rings or not. See i386/include/frame.h. * At the very least the current EFLAGS (status register, which includes * the interrupt disable state prior to the trap), the code segment register, * and the return instruction pointer are pushed by the cpu. The cpu * will also push an 'error' code for certain traps. We push a dummy * error code for those traps where the cpu doesn't in order to maintain * a consistent frame. We also push a contrived 'trap number'. * * The cpu does not push the general registers, we must do that, and we * must restore them prior to calling 'iret'. The cpu adjusts the %cs and * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we * must load them with appropriate values for supervisor mode operation. * * This code is not executed at the linked address, it is copied to the * trampoline area. As the consequence, all code there and in included files * must be PIC. */ MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) #define TRAP(a) pushl $(a) ; jmp alltraps IDTVEC(div) pushl $0; TRAP(T_DIVIDE) IDTVEC(bpt) pushl $0; TRAP(T_BPTFLT) IDTVEC(dtrace_ret) pushl $0; TRAP(T_DTRACE_RET) IDTVEC(ofl) pushl $0; TRAP(T_OFLOW) IDTVEC(bnd) pushl $0; TRAP(T_BOUND) #ifndef KDTRACE_HOOKS IDTVEC(ill) pushl $0; TRAP(T_PRIVINFLT) #endif IDTVEC(dna) pushl $0; TRAP(T_DNA) IDTVEC(fpusegm) pushl $0; TRAP(T_FPOPFLT) IDTVEC(tss) TRAP(T_TSSFLT) IDTVEC(missing) pushl $T_SEGNPFLT jmp irettraps IDTVEC(stk) pushl $T_STKFLT jmp irettraps IDTVEC(prot) pushl $T_PROTFLT jmp irettraps IDTVEC(page) testl $PSL_VM, TF_EFLAGS-TF_ERR(%esp) jnz 1f testb $SEL_RPL_MASK, TF_CS-TF_ERR(%esp) jnz 1f cmpl $PMAP_TRM_MIN_ADDRESS, TF_EIP-TF_ERR(%esp) jb 1f movl %ebx, %cr3 movl %edx, TF_EIP-TF_ERR(%esp) addl $4, %esp iret 1: pushl $T_PAGEFLT jmp alltraps IDTVEC(rsvd_pti) IDTVEC(rsvd) pushl $0; TRAP(T_RESERVED) IDTVEC(fpu) pushl $0; TRAP(T_ARITHTRAP) IDTVEC(align) TRAP(T_ALIGNFLT) IDTVEC(xmm) pushl $0; TRAP(T_XMMFLT) /* * All traps except ones for syscalls or invalid segment, * jump to alltraps. If * interrupts were enabled when the trap occurred, then interrupts * are enabled now if the trap was through a trap gate, else * disabled if the trap was through an interrupt gate. Note that * int0x80_syscall is a trap gate. Interrupt gates are used by * page faults, non-maskable interrupts, debug and breakpoint * exceptions. */ SUPERALIGN_TEXT .globl alltraps .type alltraps,@function alltraps: PUSH_FRAME2 alltraps_with_regs_pushed: SET_KERNEL_SREGS cld KENTER FAKE_MCOUNT(TF_EIP(%esp)) calltrap: pushl %esp movl $trap,%eax call *%eax add $4, %esp /* * Return via doreti to handle ASTs. */ MEXITCOUNT jmp doreti .globl irettraps .type irettraps,@function irettraps: testl $PSL_VM, TF_EFLAGS-TF_TRAPNO(%esp) jnz alltraps testb $SEL_RPL_MASK, TF_CS-TF_TRAPNO(%esp) jnz alltraps /* * Kernel mode. * The special case there is the kernel mode with user %cr3 and * trampoline stack. We need to copy both current frame and the * hardware portion of the frame we tried to return to, to the * normal stack. This logic must follow the stack unwind order * in doreti. */ PUSH_FRAME2 SET_KERNEL_SREGS cld call 1f 1: popl %ebx leal (doreti_iret - 1b)(%ebx), %edx cmpl %edx, TF_EIP(%esp) jne 2f movl $(2 * TF_SZ - TF_EIP), %ecx jmp 6f 2: leal (doreti_popl_ds - 1b)(%ebx), %edx cmpl %edx, TF_EIP(%esp) jne 3f movl $(2 * TF_SZ - TF_DS), %ecx jmp 6f 3: leal (doreti_popl_es - 1b)(%ebx), %edx cmpl %edx, TF_EIP(%esp) jne 4f movl $(2 * TF_SZ - TF_ES), %ecx jmp 6f 4: leal (doreti_popl_fs - 1b)(%ebx), %edx cmpl %edx, TF_EIP(%esp) jne 5f movl $(2 * TF_SZ - TF_FS), %ecx jmp 6f /* kernel mode, normal */ 5: FAKE_MCOUNT(TF_EIP(%esp)) jmp calltrap 6: cmpl $PMAP_TRM_MIN_ADDRESS, %esp /* trampoline stack ? */ jb 5b /* if not, no need to change stacks */ movl (tramp_idleptd - 1b)(%ebx), %eax movl %eax, %cr3 movl PCPU(KESP0), %edx subl %ecx, %edx movl %edx, %edi movl %esp, %esi rep; movsb movl %edx, %esp FAKE_MCOUNT(TF_EIP(%esp)) jmp calltrap /* * Privileged instruction fault. */ #ifdef KDTRACE_HOOKS SUPERALIGN_TEXT IDTVEC(ill) /* * Check if this is a user fault. If so, just handle it as a normal * trap. */ testl $PSL_VM, 8(%esp) /* and vm86 mode. */ jnz norm_ill cmpl $GSEL_KPL, 4(%esp) /* Check the code segment */ jne norm_ill /* * Check if a DTrace hook is registered. The trampoline cannot * be instrumented. */ cmpl $0, dtrace_invop_jump_addr je norm_ill /* * This is a kernel instruction fault that might have been caused * by a DTrace provider. */ pushal cld /* * Set our jump address for the jump back in the event that * the exception wasn't caused by DTrace at all. */ movl $norm_ill, dtrace_invop_calltrap_addr /* Jump to the code hooked in by DTrace. */ jmpl *dtrace_invop_jump_addr /* * Process the instruction fault in the normal way. */ norm_ill: pushl $0 pushl $T_PRIVINFLT jmp alltraps #endif /* * See comment in the handler for the kernel case T_TRCTRAP in trap.c. * The exception handler must be ready to execute with wrong %cr3. * We save original %cr3 in frame->tf_err, similarly to NMI and MCE * handlers. */ IDTVEC(dbg) pushl $0 pushl $T_TRCTRAP PUSH_FRAME2 SET_KERNEL_SREGS cld movl %cr3, %eax movl %eax, TF_ERR(%esp) call 1f 1: popl %eax movl (tramp_idleptd - 1b)(%eax), %eax movl %eax, %cr3 FAKE_MCOUNT(TF_EIP(%esp)) testl $PSL_VM, TF_EFLAGS(%esp) jnz dbg_user testb $SEL_RPL_MASK,TF_CS(%esp) jz calltrap dbg_user: NMOVE_STACKS movl $handle_ibrs_entry,%eax call *%eax pushl %esp movl $trap,%eax call *%eax add $4, %esp movl $T_RESERVED, TF_TRAPNO(%esp) MEXITCOUNT jmp doreti IDTVEC(mchk) pushl $0 pushl $T_MCHK jmp nmi_mchk_common IDTVEC(nmi) pushl $0 pushl $T_NMI nmi_mchk_common: PUSH_FRAME2 SET_KERNEL_SREGS cld /* * Save %cr3 into tf_err. There is no good place to put it. * Always reload %cr3, since we might have interrupted the * kernel entry or exit. * Do not switch to the thread kernel stack, otherwise we might * obliterate the previous context partially copied from the * trampoline stack. * Do not re-enable IBRS, there is no good place to store * previous state if we come from the kernel. */ movl %cr3, %eax movl %eax, TF_ERR(%esp) call 1f 1: popl %eax movl (tramp_idleptd - 1b)(%eax), %eax movl %eax, %cr3 FAKE_MCOUNT(TF_EIP(%esp)) jmp calltrap /* * Trap gate entry for syscalls (int 0x80). * This is used by FreeBSD ELF executables, "new" a.out executables, and all * Linux executables. * * Even though the name says 'int0x80', this is actually a trap gate, not an * interrupt gate. Thus interrupts are enabled on entry just as they are for * a normal syscall. */ SUPERALIGN_TEXT IDTVEC(int0x80_syscall) pushl $2 /* sizeof "int 0x80" */ pushl $0 /* tf_trapno */ PUSH_FRAME2 SET_KERNEL_SREGS cld MOVE_STACKS movl $handle_ibrs_entry,%eax call *%eax sti FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp movl $syscall, %eax call *%eax add $4, %esp MEXITCOUNT jmp doreti ENTRY(fork_trampoline) pushl %esp /* trapframe pointer */ pushl %ebx /* arg1 */ pushl %esi /* function */ movl $fork_exit, %eax call *%eax addl $12,%esp /* cut from syscall */ /* * Return via doreti to handle ASTs. */ MEXITCOUNT jmp doreti /* * To efficiently implement classification of trap and interrupt handlers * for profiling, there must be only trap handlers between the labels btrap * and bintr, and only interrupt handlers between the labels bintr and * eintr. This is implemented (partly) by including files that contain * some of the handlers. Before including the files, set up a normal asm * environment so that the included files doen't need to know that they are * included. */ .data .p2align 4 .text SUPERALIGN_TEXT MCOUNT_LABEL(bintr) #ifdef DEV_ATPIC #include #endif #if defined(DEV_APIC) && defined(DEV_ATPIC) .data .p2align 4 .text SUPERALIGN_TEXT #endif #ifdef DEV_APIC #include #endif .data .p2align 4 .text SUPERALIGN_TEXT #include .text MCOUNT_LABEL(eintr) #include /* * void doreti(struct trapframe) * * Handle return from interrupts, traps and syscalls. */ .text SUPERALIGN_TEXT .type doreti,@function .globl doreti doreti: FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ doreti_next: /* * Check if ASTs can be handled now. ASTs cannot be safely * processed when returning from an NMI. */ cmpb $T_NMI,TF_TRAPNO(%esp) #ifdef HWPMC_HOOKS je doreti_nmi #else je doreti_exit #endif /* * PSL_VM must be checked first since segment registers only * have an RPL in non-VM86 mode. * ASTs can not be handled now if we are in a vm86 call. */ testl $PSL_VM,TF_EFLAGS(%esp) jz doreti_notvm86 movl PCPU(CURPCB),%ecx testl $PCB_VM86CALL,PCB_FLAGS(%ecx) jz doreti_ast jmp doreti_popl_fs doreti_notvm86: testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ jz doreti_exit /* can't handle ASTs now if not */ doreti_ast: /* * Check for ASTs atomically with returning. Disabling CPU * interrupts provides sufficient locking even in the SMP case, * since we will be informed of any new ASTs by an IPI. */ cli movl PCPU(CURTHREAD),%eax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) je doreti_exit sti pushl %esp /* pass a pointer to the trapframe */ movl $ast, %eax call *%eax add $4,%esp jmp doreti_ast /* * doreti_exit: pop registers, iret. * * The segment register pop is a special case, since it may * fault if (for example) a sigreturn specifies bad segment * registers. The fault is handled in trap.c. */ doreti_exit: MEXITCOUNT cmpl $T_NMI, TF_TRAPNO(%esp) je doreti_iret_nmi cmpl $T_MCHK, TF_TRAPNO(%esp) je doreti_iret_nmi cmpl $T_TRCTRAP, TF_TRAPNO(%esp) je doreti_iret_nmi movl $TF_SZ, %ecx testl $PSL_VM,TF_EFLAGS(%esp) jz 1f /* PCB_VM86CALL is not set */ addl $VM86_STACK_SPACE, %ecx jmp 2f 1: testl $SEL_RPL_MASK, TF_CS(%esp) jz doreti_popl_fs 2: movl $handle_ibrs_exit,%eax pushl %ecx /* preserve enough call-used regs */ call *%eax + movl mds_handler,%eax + call *%eax popl %ecx movl %esp, %esi movl PCPU(TRAMPSTK), %edx subl %ecx, %edx movl %edx, %edi rep; movsb movl %edx, %esp movl PCPU(CURPCB),%eax movl PCB_CR3(%eax), %eax movl %eax, %cr3 .globl doreti_popl_fs doreti_popl_fs: popl %fs .globl doreti_popl_es doreti_popl_es: popl %es .globl doreti_popl_ds doreti_popl_ds: popl %ds popal addl $8,%esp .globl doreti_iret doreti_iret: iret doreti_iret_nmi: movl TF_ERR(%esp), %eax movl %eax, %cr3 jmp doreti_popl_fs /* * doreti_iret_fault and friends. Alternative return code for * the case where we get a fault in the doreti_exit code * above. trap() (i386/i386/trap.c) catches this specific * case, and continues in the corresponding place in the code * below. * * If the fault occured during return to usermode, we recreate * the trap frame and call trap() to send a signal. Otherwise * the kernel was tricked into fault by attempt to restore invalid * usermode segment selectors on return from nested fault or * interrupt, where interrupted kernel entry code not yet loaded * kernel selectors. In the latter case, emulate iret and zero * the invalid selector. */ ALIGN_TEXT .globl doreti_iret_fault doreti_iret_fault: pushl $0 /* tf_err */ pushl $0 /* tf_trapno XXXKIB: provide more useful value ? */ pushal pushl $0 movw %ds,(%esp) .globl doreti_popl_ds_fault doreti_popl_ds_fault: testb $SEL_RPL_MASK,TF_CS-TF_DS(%esp) jz doreti_popl_ds_kfault pushl $0 movw %es,(%esp) .globl doreti_popl_es_fault doreti_popl_es_fault: testb $SEL_RPL_MASK,TF_CS-TF_ES(%esp) jz doreti_popl_es_kfault pushl $0 movw %fs,(%esp) .globl doreti_popl_fs_fault doreti_popl_fs_fault: testb $SEL_RPL_MASK,TF_CS-TF_FS(%esp) jz doreti_popl_fs_kfault movl $0,TF_ERR(%esp) /* XXX should be the error code */ movl $T_PROTFLT,TF_TRAPNO(%esp) SET_KERNEL_SREGS jmp calltrap doreti_popl_ds_kfault: movl $0,(%esp) jmp doreti_popl_ds doreti_popl_es_kfault: movl $0,(%esp) jmp doreti_popl_es doreti_popl_fs_kfault: movl $0,(%esp) jmp doreti_popl_fs #ifdef HWPMC_HOOKS doreti_nmi: /* * Since we are returning from an NMI, check if the current trap * was from user mode and if so whether the current thread * needs a user call chain capture. */ testl $PSL_VM, TF_EFLAGS(%esp) jnz doreti_exit testb $SEL_RPL_MASK,TF_CS(%esp) jz doreti_exit movl PCPU(CURTHREAD),%eax /* curthread present? */ orl %eax,%eax jz doreti_exit testl $TDP_CALLCHAIN,TD_PFLAGS(%eax) /* flagged for capture? */ jz doreti_exit /* * Switch to thread stack. Reset tf_trapno to not indicate NMI, * to cause normal userspace exit. */ movl $T_RESERVED, TF_TRAPNO(%esp) NMOVE_STACKS /* * Take the processor out of NMI mode by executing a fake "iret". */ pushfl pushl %cs call 1f 1: popl %eax leal (outofnmi-1b)(%eax),%eax pushl %eax iret outofnmi: /* * Call the callchain capture hook after turning interrupts back on. */ movl pmc_hook,%ecx orl %ecx,%ecx jz doreti_exit pushl %esp /* frame pointer */ pushl $PMC_FN_USER_CALLCHAIN /* command */ movl PCPU(CURTHREAD),%eax pushl %eax /* curthread */ sti call *%ecx addl $12,%esp jmp doreti_ast #endif ENTRY(end_exceptions) Index: projects/runtime-coverage-v2/sys/i386/i386/genassym.c =================================================================== --- projects/runtime-coverage-v2/sys/i386/i386/genassym.c (revision 347598) +++ projects/runtime-coverage-v2/sys/i386/i386/genassym.c (revision 347599) @@ -1,234 +1,237 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #endif #include #include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(TD0_KSTACK_PAGES, TD0_KSTACK_PAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0)); ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2)); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds)); ASSYM(PCB_ES, offsetof(struct pcb, pcb_es)); ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs)); ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_EXT_TSS, offsetof(struct pcb_ext, ext_tss)); ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd)); ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd)); ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_VM86CALL, PCB_VM86CALL); ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt)); ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt)); ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(TF_FS, offsetof(struct trapframe, tf_fs)); ASSYM(TF_ES, offsetof(struct trapframe, tf_es)); ASSYM(TF_DS, offsetof(struct trapframe, tf_ds)); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(TF_SZ, sizeof(struct trapframe)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); #ifdef COMPAT_43 ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); #endif ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); #ifdef COMPAT_FREEBSD4 ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc)); #endif #ifdef COMPAT_43 ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); #endif #ifdef COMPAT_FREEBSD4 ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags)); ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs)); #endif ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_COMMON_TSSP, offsetof(struct pcpu, pc_common_tssp)); ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd)); ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt)); ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt)); ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss)); ASSYM(PC_KESP0, offsetof(struct pcpu, pc_kesp0)); ASSYM(PC_TRAMPSTK, offsetof(struct pcpu, pc_trampstk)); ASSYM(PC_COPYOUT_BUF, offsetof(struct pcpu, pc_copyout_buf)); ASSYM(PC_IBPB_SET, offsetof(struct pcpu, pc_ibpb_set)); +ASSYM(PC_MDS_TMP, offsetof(struct pcpu, pc_mds_tmp)); +ASSYM(PC_MDS_BUF, offsetof(struct pcpu, pc_mds_buf)); +ASSYM(PC_MDS_BUF64, offsetof(struct pcpu, pc_mds_buf64)); ASSYM(PMAP_TRM_MIN_ADDRESS, PMAP_TRM_MIN_ADDRESS); ASSYM(KERNLOAD, KERNLOAD); ASSYM(KERNBASE, KERNBASE); #ifdef DEV_APIC ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); ASSYM(VM86_STACK_SPACE, VM86_STACK_SPACE); ASSYM(TRAMP_COPYOUT_SZ, TRAMP_COPYOUT_SZ); #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif Index: projects/runtime-coverage-v2/sys/i386/i386/initcpu.c =================================================================== --- projects/runtime-coverage-v2/sys/i386/i386/initcpu.c (revision 347598) +++ projects/runtime-coverage-v2/sys/i386/i386/initcpu.c (revision 347599) @@ -1,983 +1,984 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) KATO Takenori, 1997, 1998. * * All rights reserved. Unpublished rights reserved under the copyright * laws of Japan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer as * the first lines of this file unmodified. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #ifdef I486_CPU static void init_5x86(void); static void init_bluelightning(void); static void init_486dlc(void); static void init_cy486dx(void); #ifdef CPU_I486_ON_386 static void init_i486_on_386(void); #endif static void init_6x86(void); #endif /* I486_CPU */ #if defined(I586_CPU) && defined(CPU_WT_ALLOC) static void enable_K5_wt_alloc(void); static void enable_K6_wt_alloc(void); static void enable_K6_2_wt_alloc(void); #endif #ifdef I686_CPU static void init_6x86MX(void); static void init_ppro(void); static void init_mendocino(void); #endif static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); /* * -1: automatic (default) * 0: keep enable CLFLUSH * 1: force disable CLFLUSH */ static int hw_clflush_disable = -1; u_int cyrix_did; /* Device ID of Cyrix CPU */ #ifdef I486_CPU /* * IBM Blue Lightning */ static void init_bluelightning(void) { register_t saveintr; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); invd(); #ifdef CPU_BLUELIGHTNING_FPU_OP_CACHE wrmsr(0x1000, 0x9c92LL); /* FP operand can be cacheable on Cyrix FPU */ #else wrmsr(0x1000, 0x1c92LL); /* Intel FPU */ #endif /* Enables 13MB and 0-640KB cache. */ wrmsr(0x1001, (0xd0LL << 32) | 0x3ff); #ifdef CPU_BLUELIGHTNING_3X wrmsr(0x1002, 0x04000000LL); /* Enables triple-clock mode. */ #else wrmsr(0x1002, 0x03000000LL); /* Enables double-clock mode. */ #endif /* Enable caching in CR0. */ load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ invd(); intr_restore(saveintr); } /* * Cyrix 486SLC/DLC/SR/DR series */ static void init_486dlc(void) { register_t saveintr; u_char ccr0; saveintr = intr_disable(); invd(); ccr0 = read_cyrix_reg(CCR0); #ifndef CYRIX_CACHE_WORKS ccr0 |= CCR0_NC1 | CCR0_BARB; write_cyrix_reg(CCR0, ccr0); invd(); #else ccr0 &= ~CCR0_NC0; #ifndef CYRIX_CACHE_REALLY_WORKS ccr0 |= CCR0_NC1 | CCR0_BARB; #else ccr0 |= CCR0_NC1; #endif #ifdef CPU_DIRECT_MAPPED_CACHE ccr0 |= CCR0_CO; /* Direct mapped mode. */ #endif write_cyrix_reg(CCR0, ccr0); /* Clear non-cacheable region. */ write_cyrix_reg(NCR1+2, NCR_SIZE_0K); write_cyrix_reg(NCR2+2, NCR_SIZE_0K); write_cyrix_reg(NCR3+2, NCR_SIZE_0K); write_cyrix_reg(NCR4+2, NCR_SIZE_0K); write_cyrix_reg(0, 0); /* dummy write */ /* Enable caching in CR0. */ load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ invd(); #endif /* !CYRIX_CACHE_WORKS */ intr_restore(saveintr); } /* * Cyrix 486S/DX series */ static void init_cy486dx(void) { register_t saveintr; u_char ccr2; saveintr = intr_disable(); invd(); ccr2 = read_cyrix_reg(CCR2); #ifdef CPU_SUSP_HLT ccr2 |= CCR2_SUSP_HLT; #endif write_cyrix_reg(CCR2, ccr2); intr_restore(saveintr); } /* * Cyrix 5x86 */ static void init_5x86(void) { register_t saveintr; u_char ccr2, ccr3, ccr4, pcr0; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); (void)read_cyrix_reg(CCR3); /* dummy */ /* Initialize CCR2. */ ccr2 = read_cyrix_reg(CCR2); ccr2 |= CCR2_WB; #ifdef CPU_SUSP_HLT ccr2 |= CCR2_SUSP_HLT; #else ccr2 &= ~CCR2_SUSP_HLT; #endif ccr2 |= CCR2_WT1; write_cyrix_reg(CCR2, ccr2); /* Initialize CCR4. */ ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); ccr4 = read_cyrix_reg(CCR4); ccr4 |= CCR4_DTE; ccr4 |= CCR4_MEM; #ifdef CPU_FASTER_5X86_FPU ccr4 |= CCR4_FASTFPE; #else ccr4 &= ~CCR4_FASTFPE; #endif ccr4 &= ~CCR4_IOMASK; /******************************************************************** * WARNING: The "BIOS Writers Guide" mentions that I/O recovery time * should be 0 for errata fix. ********************************************************************/ #ifdef CPU_IORT ccr4 |= CPU_IORT & CCR4_IOMASK; #endif write_cyrix_reg(CCR4, ccr4); /* Initialize PCR0. */ /**************************************************************** * WARNING: RSTK_EN and LOOP_EN could make your system unstable. * BTB_EN might make your system unstable. ****************************************************************/ pcr0 = read_cyrix_reg(PCR0); #ifdef CPU_RSTK_EN pcr0 |= PCR0_RSTK; #else pcr0 &= ~PCR0_RSTK; #endif #ifdef CPU_BTB_EN pcr0 |= PCR0_BTB; #else pcr0 &= ~PCR0_BTB; #endif #ifdef CPU_LOOP_EN pcr0 |= PCR0_LOOP; #else pcr0 &= ~PCR0_LOOP; #endif /**************************************************************** * WARNING: if you use a memory mapped I/O device, don't use * DISABLE_5X86_LSSER option, which may reorder memory mapped * I/O access. * IF YOUR MOTHERBOARD HAS PCI BUS, DON'T DISABLE LSSER. ****************************************************************/ #ifdef CPU_DISABLE_5X86_LSSER pcr0 &= ~PCR0_LSSER; #else pcr0 |= PCR0_LSSER; #endif write_cyrix_reg(PCR0, pcr0); /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); (void)read_cyrix_reg(0x80); /* dummy */ /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0, NW = 1 */ /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } #ifdef CPU_I486_ON_386 /* * There are i486 based upgrade products for i386 machines. * In this case, BIOS doesn't enable CPU cache. */ static void init_i486_on_386(void) { register_t saveintr; saveintr = intr_disable(); load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0, NW = 0 */ intr_restore(saveintr); } #endif /* * Cyrix 6x86 * * XXX - What should I do here? Please let me know. */ static void init_6x86(void) { register_t saveintr; u_char ccr3, ccr4; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); /* Initialize CCR0. */ write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); /* Initialize CCR1. */ #ifdef CPU_CYRIX_NO_LOCK write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); #else write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); #endif /* Initialize CCR2. */ #ifdef CPU_SUSP_HLT write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); #else write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); #endif ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); /* Initialize CCR4. */ ccr4 = read_cyrix_reg(CCR4); ccr4 |= CCR4_DTE; ccr4 &= ~CCR4_IOMASK; #ifdef CPU_IORT write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); #else write_cyrix_reg(CCR4, ccr4 | 7); #endif /* Initialize CCR5. */ #ifdef CPU_WT_ALLOC write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); #endif /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); /* * Earlier revision of the 6x86 CPU could crash the system if * L1 cache is in write-back mode. */ if ((cyrix_did & 0xff00) > 0x1600) load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ else { /* Revision 2.6 and lower. */ #ifdef CYRIX_CACHE_REALLY_WORKS load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ #else load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0 and NW = 1 */ #endif } /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } #endif /* I486_CPU */ #ifdef I586_CPU /* * Rise mP6 */ static void init_rise(void) { /* * The CMPXCHG8B instruction is always available but hidden. */ cpu_feature |= CPUID_CX8; } /* * IDT WinChip C6/2/2A/2B/3 * * http://www.centtech.com/winchip_bios_writers_guide_v4_0.pdf */ static void init_winchip(void) { u_int regs[4]; uint64_t fcr; fcr = rdmsr(0x0107); /* * Set ECX8, DSMC, DTLOCK/EDCTLB, EMMX, and ERETSTK and clear DPDC. */ fcr |= (1 << 1) | (1 << 7) | (1 << 8) | (1 << 9) | (1 << 16); fcr &= ~(1ULL << 11); /* * Additionally, set EBRPRED, E2MMX and EAMD3D for WinChip 2 and 3. */ if (CPUID_TO_MODEL(cpu_id) >= 8) fcr |= (1 << 12) | (1 << 19) | (1 << 20); wrmsr(0x0107, fcr); do_cpuid(1, regs); cpu_feature = regs[3]; } #endif #ifdef I686_CPU /* * Cyrix 6x86MX (code-named M2) * * XXX - What should I do here? Please let me know. */ static void init_6x86MX(void) { register_t saveintr; u_char ccr3, ccr4; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); /* Initialize CCR0. */ write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); /* Initialize CCR1. */ #ifdef CPU_CYRIX_NO_LOCK write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); #else write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); #endif /* Initialize CCR2. */ #ifdef CPU_SUSP_HLT write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); #else write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); #endif ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); /* Initialize CCR4. */ ccr4 = read_cyrix_reg(CCR4); ccr4 &= ~CCR4_IOMASK; #ifdef CPU_IORT write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); #else write_cyrix_reg(CCR4, ccr4 | 7); #endif /* Initialize CCR5. */ #ifdef CPU_WT_ALLOC write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); #endif /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } static int ppro_apic_used = -1; static void init_ppro(void) { u_int64_t apicbase; /* * Local APIC should be disabled if it is not going to be used. */ if (ppro_apic_used != 1) { apicbase = rdmsr(MSR_APICBASE); apicbase &= ~APICBASE_ENABLED; wrmsr(MSR_APICBASE, apicbase); ppro_apic_used = 0; } } /* * If the local APIC is going to be used after being disabled above, * re-enable it and don't disable it in the future. */ void ppro_reenable_apic(void) { u_int64_t apicbase; if (ppro_apic_used == 0) { apicbase = rdmsr(MSR_APICBASE); apicbase |= APICBASE_ENABLED; wrmsr(MSR_APICBASE, apicbase); ppro_apic_used = 1; } } /* * Initialize BBL_CR_CTL3 (Control register 3: used to configure the * L2 cache). */ static void init_mendocino(void) { #ifdef CPU_PPRO2CELERON register_t saveintr; u_int64_t bbl_cr_ctl3; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); bbl_cr_ctl3 = rdmsr(MSR_BBL_CR_CTL3); /* If the L2 cache is configured, do nothing. */ if (!(bbl_cr_ctl3 & 1)) { bbl_cr_ctl3 = 0x134052bLL; /* Set L2 Cache Latency (Default: 5). */ #ifdef CPU_CELERON_L2_LATENCY #if CPU_L2_LATENCY > 15 #error invalid CPU_L2_LATENCY. #endif bbl_cr_ctl3 |= CPU_L2_LATENCY << 1; #else bbl_cr_ctl3 |= 5 << 1; #endif wrmsr(MSR_BBL_CR_CTL3, bbl_cr_ctl3); } load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); intr_restore(saveintr); #endif /* CPU_PPRO2CELERON */ } /* * Initialize special VIA features */ static void init_via(void) { u_int regs[4], val; uint64_t fcr; /* * Explicitly enable CX8 and PGE on C3. * * http://www.via.com.tw/download/mainboards/6/13/VIA_C3_EBGA%20datasheet110.pdf */ if (CPUID_TO_MODEL(cpu_id) <= 9) fcr = (1 << 1) | (1 << 7); else fcr = 0; /* * Check extended CPUID for PadLock features. * * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf */ do_cpuid(0xc0000000, regs); if (regs[0] >= 0xc0000001) { do_cpuid(0xc0000001, regs); val = regs[3]; } else val = 0; /* Enable RNG if present. */ if ((val & VIA_CPUID_HAS_RNG) != 0) { via_feature_rng = VIA_HAS_RNG; wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG); } /* Enable PadLock if present. */ if ((val & VIA_CPUID_HAS_ACE) != 0) via_feature_xcrypt |= VIA_HAS_AES; if ((val & VIA_CPUID_HAS_ACE2) != 0) via_feature_xcrypt |= VIA_HAS_AESCTR; if ((val & VIA_CPUID_HAS_PHE) != 0) via_feature_xcrypt |= VIA_HAS_SHA; if ((val & VIA_CPUID_HAS_PMM) != 0) via_feature_xcrypt |= VIA_HAS_MM; if (via_feature_xcrypt != 0) fcr |= 1 << 28; wrmsr(0x1107, rdmsr(0x1107) | fcr); } #endif /* I686_CPU */ #if defined(I586_CPU) || defined(I686_CPU) static void init_transmeta(void) { u_int regs[0]; /* Expose all hidden features. */ wrmsr(0x80860004, rdmsr(0x80860004) | ~0UL); do_cpuid(1, regs); cpu_feature = regs[3]; } #endif extern int elf32_nxstack; void initializecpu(void) { uint64_t msr; switch (cpu) { #ifdef I486_CPU case CPU_BLUE: init_bluelightning(); break; case CPU_486DLC: init_486dlc(); break; case CPU_CY486DX: init_cy486dx(); break; case CPU_M1SC: init_5x86(); break; #ifdef CPU_I486_ON_386 case CPU_486: init_i486_on_386(); break; #endif case CPU_M1: init_6x86(); break; #endif /* I486_CPU */ #ifdef I586_CPU case CPU_586: switch (cpu_vendor_id) { case CPU_VENDOR_AMD: #ifdef CPU_WT_ALLOC if (((cpu_id & 0x0f0) > 0) && ((cpu_id & 0x0f0) < 0x60) && ((cpu_id & 0x00f) > 3)) enable_K5_wt_alloc(); else if (((cpu_id & 0x0f0) > 0x80) || (((cpu_id & 0x0f0) == 0x80) && (cpu_id & 0x00f) > 0x07)) enable_K6_2_wt_alloc(); else if ((cpu_id & 0x0f0) > 0x50) enable_K6_wt_alloc(); #endif if ((cpu_id & 0xf0) == 0xa0) /* * Make sure the TSC runs through * suspension, otherwise we can't use * it as timecounter */ wrmsr(0x1900, rdmsr(0x1900) | 0x20ULL); break; case CPU_VENDOR_CENTAUR: init_winchip(); break; case CPU_VENDOR_TRANSMETA: init_transmeta(); break; case CPU_VENDOR_RISE: init_rise(); break; } break; #endif #ifdef I686_CPU case CPU_M2: init_6x86MX(); break; case CPU_686: switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: switch (cpu_id & 0xff0) { case 0x610: init_ppro(); break; case 0x660: init_mendocino(); break; } break; #ifdef CPU_ATHLON_SSE_HACK case CPU_VENDOR_AMD: /* * Sometimes the BIOS doesn't enable SSE instructions. * According to AMD document 20734, the mobile * Duron, the (mobile) Athlon 4 and the Athlon MP * support SSE. These correspond to cpu_id 0x66X * or 0x67X. */ if ((cpu_feature & CPUID_XMM) == 0 && ((cpu_id & ~0xf) == 0x660 || (cpu_id & ~0xf) == 0x670 || (cpu_id & ~0xf) == 0x680)) { u_int regs[4]; wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) & ~0x08000); do_cpuid(1, regs); cpu_feature = regs[3]; } break; #endif case CPU_VENDOR_CENTAUR: init_via(); break; case CPU_VENDOR_TRANSMETA: init_transmeta(); break; } break; #endif default: break; } if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { load_cr4(rcr4() | CR4_FXSR | CR4_XMM); cpu_fxsr = hw_instruction_sse = 1; } if (elf32_nxstack) { msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); } + hw_mds_recalculate(); if ((amd_feature & AMDID_RDTSCP) != 0 || (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0) wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid)); } void initializecpucache(void) { /* * CPUID with %eax = 1, %ebx returns * Bits 15-8: CLFLUSH line size * (Value * 8 = cache line size in bytes) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; /* * XXXKIB: (temporary) hack to work around traps generated * when CLFLUSHing APIC register window under virtualization * environments. These environments tend to disable the * CPUID_SS feature even though the native CPU supports it. */ TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } /* * The kernel's use of CLFLUSH{,OPT} can be disabled manually * by setting the hw.clflush_disable tunable. */ if (hw_clflush_disable == 1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } } #if defined(I586_CPU) && defined(CPU_WT_ALLOC) /* * Enable write allocate feature of AMD processors. * Following two functions require the Maxmem variable being set. */ static void enable_K5_wt_alloc(void) { u_int64_t msr; register_t saveintr; /* * Write allocate is supported only on models 1, 2, and 3, with * a stepping of 4 or greater. */ if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) { saveintr = intr_disable(); msr = rdmsr(0x83); /* HWCR */ wrmsr(0x83, msr & !(0x10)); /* * We have to tell the chip where the top of memory is, * since video cards could have frame bufferes there, * memory-mapped I/O could be there, etc. */ if(Maxmem > 0) msr = Maxmem / 16; else msr = 0; msr |= AMD_WT_ALLOC_TME | AMD_WT_ALLOC_FRE; /* * There is no way to know wheter 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ wrmsr(0x86, 0x0ff00f0); msr |= AMD_WT_ALLOC_PRE; wrmsr(0x85, msr); msr=rdmsr(0x83); wrmsr(0x83, msr|0x10); /* enable write allocate */ intr_restore(saveintr); } } static void enable_K6_wt_alloc(void) { quad_t size; u_int64_t whcr; register_t saveintr; saveintr = intr_disable(); wbinvd(); #ifdef CPU_DISABLE_CACHE /* * Certain K6-2 box becomes unstable when write allocation is * enabled. */ /* * The AMD-K6 processer provides the 64-bit Test Register 12(TR12), * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported. * All other bits in TR12 have no effect on the processer's operation. * The I/O Trap Restart function (bit 9 of TR12) is always enabled * on the AMD-K6. */ wrmsr(0x0000000e, (u_int64_t)0x0008); #endif /* Don't assume that memory size is aligned with 4M. */ if (Maxmem > 0) size = ((Maxmem >> 8) + 3) >> 2; else size = 0; /* Limit is 508M bytes. */ if (size > 0x7f) size = 0x7f; whcr = (rdmsr(0xc0000082) & ~(0x7fLL << 1)) | (size << 1); #if defined(NO_MEMORY_HOLE) if (whcr & (0x7fLL << 1)) whcr |= 0x0001LL; #else /* * There is no way to know wheter 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ whcr &= ~0x0001LL; #endif wrmsr(0x0c0000082, whcr); intr_restore(saveintr); } static void enable_K6_2_wt_alloc(void) { quad_t size; u_int64_t whcr; register_t saveintr; saveintr = intr_disable(); wbinvd(); #ifdef CPU_DISABLE_CACHE /* * Certain K6-2 box becomes unstable when write allocation is * enabled. */ /* * The AMD-K6 processer provides the 64-bit Test Register 12(TR12), * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported. * All other bits in TR12 have no effect on the processer's operation. * The I/O Trap Restart function (bit 9 of TR12) is always enabled * on the AMD-K6. */ wrmsr(0x0000000e, (u_int64_t)0x0008); #endif /* Don't assume that memory size is aligned with 4M. */ if (Maxmem > 0) size = ((Maxmem >> 8) + 3) >> 2; else size = 0; /* Limit is 4092M bytes. */ if (size > 0x3fff) size = 0x3ff; whcr = (rdmsr(0xc0000082) & ~(0x3ffLL << 22)) | (size << 22); #if defined(NO_MEMORY_HOLE) if (whcr & (0x3ffLL << 22)) whcr |= 1LL << 16; #else /* * There is no way to know wheter 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ whcr &= ~(1LL << 16); #endif wrmsr(0x0c0000082, whcr); intr_restore(saveintr); } #endif /* I585_CPU && CPU_WT_ALLOC */ #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(cyrixreg, cyrixreg) { register_t saveintr; u_int cr0; u_char ccr1, ccr2, ccr3; u_char ccr0 = 0, ccr4 = 0, ccr5 = 0, pcr0 = 0; cr0 = rcr0(); if (cpu_vendor_id == CPU_VENDOR_CYRIX) { saveintr = intr_disable(); if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) { ccr0 = read_cyrix_reg(CCR0); } ccr1 = read_cyrix_reg(CCR1); ccr2 = read_cyrix_reg(CCR2); ccr3 = read_cyrix_reg(CCR3); if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { write_cyrix_reg(CCR3, CCR3_MAPEN0); ccr4 = read_cyrix_reg(CCR4); if ((cpu == CPU_M1) || (cpu == CPU_M2)) ccr5 = read_cyrix_reg(CCR5); else pcr0 = read_cyrix_reg(PCR0); write_cyrix_reg(CCR3, ccr3); /* Restore CCR3. */ } intr_restore(saveintr); if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) printf("CCR0=%x, ", (u_int)ccr0); printf("CCR1=%x, CCR2=%x, CCR3=%x", (u_int)ccr1, (u_int)ccr2, (u_int)ccr3); if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { printf(", CCR4=%x, ", (u_int)ccr4); if (cpu == CPU_M1SC) printf("PCR0=%x\n", pcr0); else printf("CCR5=%x\n", ccr5); } } printf("CR0=%x\n", cr0); } #endif /* DDB */ Index: projects/runtime-coverage-v2/sys/i386/i386/support.s =================================================================== --- projects/runtime-coverage-v2/sys/i386/i386/support.s (revision 347598) +++ projects/runtime-coverage-v2/sys/i386/i386/support.s (revision 347599) @@ -1,474 +1,658 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include "assym.inc" #define IDXSHIFT 10 .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret END(bzero) ENTRY(sse2_pagezero) pushl %ebx movl 8(%esp),%ecx movl %ecx,%eax addl $4096,%eax xor %ebx,%ebx jmp 1f /* * The loop takes 14 bytes. Ensure that it doesn't cross a 16-byte * cache line. */ .p2align 4,0x90 1: movnti %ebx,(%ecx) movnti %ebx,4(%ecx) addl $8,%ecx cmpl %ecx,%eax jne 1b sfence popl %ebx ret END(sse2_pagezero) ENTRY(i686_pagezero) pushl %edi pushl %ebx movl 12(%esp),%edi movl $1024,%ecx ALIGN_TEXT 1: xorl %eax,%eax repe scasl jnz 2f popl %ebx popl %edi ret ALIGN_TEXT 2: incl %ecx subl $4,%edi movl %ecx,%edx cmpl $16,%ecx jge 3f movl %edi,%ebx andl $0x3f,%ebx shrl %ebx shrl %ebx movl $16,%ecx subl %ebx,%ecx 3: subl %ecx,%edx rep stosl movl %edx,%ecx testl %edx,%edx jnz 1b popl %ebx popl %edi ret END(i686_pagezero) /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx rep stosw popl %edi ret END(fillw) /* * memmove(dst, src, cnt) (return dst) * bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(bcopy) movl 4(%esp),%eax movl 8(%esp),%edx movl %eax,8(%esp) movl %edx,4(%esp) MEXITCOUNT jmp memmove END(bcopy) ENTRY(memmove) pushl %ebp movl %esp,%ebp pushl %esi pushl %edi movl 8(%ebp),%edi movl 12(%ebp),%esi 1: movl 16(%ebp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ rep movsl movl 16(%ebp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi movl 8(%ebp),%eax /* return dst for memmove */ popl %ebp ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 16(%ebp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld movl 8(%ebp),%eax /* return dst for memmove */ popl %ebp ret END(memmove) /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret END(memcpy) /* * copystr(from, to, maxlen, int *lencopied) - MP SAFE */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret END(copystr) ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx movl %edx,%ecx shrl $2,%ecx repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb 1: setne %al movsbl %al,%eax popl %esi popl %edi ret END(bcmp) /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %eax,%ds movl %eax,%es movl %eax,%gs movl %eax,%ss movl $KPSEL,%eax movl %eax,%fs /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) MEXITCOUNT lret END(lgdt) /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret END(ssdtosd) /* void reset_dbregs() */ ENTRY(reset_dbregs) movl $0,%eax movl %eax,%dr7 /* disable all breakpoints first */ movl %eax,%dr0 movl %eax,%dr1 movl %eax,%dr2 movl %eax,%dr3 movl %eax,%dr6 ret END(reset_dbregs) /*****************************************************************************/ /* setjump, longjump */ /*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret END(setjmp) ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret END(longjmp) /* * Support for reading MSRs in the safe manner. (Instead of panic on #gp, * return an error.) */ ENTRY(rdmsr_safe) /* int rdmsr_safe(u_int msr, uint64_t *data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx rdmsr movl 8(%esp),%ecx movl %eax,(%ecx) movl %edx,4(%ecx) xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * Support for writing MSRs in the safe manner. (Instead of panic on #gp, * return an error.) */ ENTRY(wrmsr_safe) /* int wrmsr_safe(u_int msr, uint64_t data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx movl 8(%esp),%eax movl 12(%esp),%edx wrmsr xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * MSR operations fault handler */ ALIGN_TEXT msr_onfault: movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl $EFAULT,%eax ret ENTRY(handle_ibrs_entry) cmpb $0,hw_ibrs_active je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx wrmsr movb $1,PCPU(IBPB_SET) /* * i386 does not implement SMEP, but the 4/4 split makes this not * that important. */ 1: ret END(handle_ibrs_entry) ENTRY(handle_ibrs_exit) cmpb $0,PCPU(IBPB_SET) je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr andl $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax andl $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx wrmsr movb $0,PCPU(IBPB_SET) 1: ret END(handle_ibrs_exit) + +ENTRY(mds_handler_void) + ret +END(mds_handler_void) + +ENTRY(mds_handler_verw) + subl $4, %esp + movw %ds, (%esp) + verw (%esp) + addl $4, %esp + ret +END(mds_handler_verw) + +ENTRY(mds_handler_ivb) + movl %cr0, %eax + testb $CR0_TS, %al + je 1f + clts +1: movl PCPU(MDS_BUF), %edx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + lfence + orpd (%edx), %xmm0 + orpd (%edx), %xmm0 + mfence + movl $40, %ecx + addl $16, %edx +2: movntdq %xmm0, (%edx) + addl $16, %edx + decl %ecx + jnz 2b + mfence + + movdqa PCPU(MDS_TMP),%xmm0 + testb $CR0_TS, %al + je 3f + movl %eax, %cr0 +3: ret +END(mds_handler_ivb) + +ENTRY(mds_handler_bdw) + movl %cr0, %eax + testb $CR0_TS, %al + je 1f + clts +1: movl PCPU(MDS_BUF), %ebx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + movl %ebx, %edi + movl %ebx, %esi + movl $40, %ecx +2: movntdq %xmm0, (%ebx) + addl $16, %ebx + decl %ecx + jnz 2b + mfence + movl $1536, %ecx + rep; movsb + lfence + + movdqa PCPU(MDS_TMP),%xmm0 + testb $CR0_TS, %al + je 3f + movl %eax, %cr0 +3: ret +END(mds_handler_bdw) + +ENTRY(mds_handler_skl_sse) + movl %cr0, %eax + testb $CR0_TS, %al + je 1f + clts +1: movl PCPU(MDS_BUF), %edi + movl PCPU(MDS_BUF64), %edx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + lfence + orpd (%edx), %xmm0 + orpd (%edx), %xmm0 + xorl %eax, %eax +2: clflushopt 5376(%edi, %eax, 8) + addl $8, %eax + cmpl $8 * 12, %eax + jb 2b + sfence + movl $6144, %ecx + xorl %eax, %eax + rep; stosb + mfence + + movdqa PCPU(MDS_TMP), %xmm0 + testb $CR0_TS, %al + je 3f + movl %eax, %cr0 +3: ret +END(mds_handler_skl_sse) + +ENTRY(mds_handler_skl_avx) + movl %cr0, %eax + testb $CR0_TS, %al + je 1f + clts +1: movl PCPU(MDS_BUF), %edi + movl PCPU(MDS_BUF64), %edx + vmovdqa %ymm0, PCPU(MDS_TMP) + vpxor %ymm0, %ymm0, %ymm0 + + lfence + vorpd (%edx), %ymm0, %ymm0 + vorpd (%edx), %ymm0, %ymm0 + xorl %eax, %eax +2: clflushopt 5376(%edi, %eax, 8) + addl $8, %eax + cmpl $8 * 12, %eax + jb 2b + sfence + movl $6144, %ecx + xorl %eax, %eax + rep; stosb + mfence + + vmovdqa PCPU(MDS_TMP), %ymm0 + testb $CR0_TS, %al + je 3f + movl %eax, %cr0 +3: ret +END(mds_handler_skl_avx) + +ENTRY(mds_handler_skl_avx512) + movl %cr0, %eax + testb $CR0_TS, %al + je 1f + clts +1: movl PCPU(MDS_BUF), %edi + movl PCPU(MDS_BUF64), %edx + vmovdqa64 %zmm0, PCPU(MDS_TMP) + vpxor %zmm0, %zmm0, %zmm0 + + lfence + vorpd (%edx), %zmm0, %zmm0 + vorpd (%edx), %zmm0, %zmm0 + xorl %eax, %eax +2: clflushopt 5376(%edi, %eax, 8) + addl $8, %eax + cmpl $8 * 12, %eax + jb 2b + sfence + movl $6144, %ecx + xorl %eax, %eax + rep; stosb + mfence + + vmovdqa64 PCPU(MDS_TMP), %zmm0 + testb $CR0_TS, %al + je 3f + movl %eax, %cr0 +3: ret +END(mds_handler_skl_avx512) + +ENTRY(mds_handler_silvermont) + movl %cr0, %eax + testb $CR0_TS, %al + je 1f + clts +1: movl PCPU(MDS_BUF), %edx + movdqa %xmm0, PCPU(MDS_TMP) + pxor %xmm0, %xmm0 + + movl $16, %ecx +2: movntdq %xmm0, (%edx) + addl $16, %edx + decl %ecx + jnz 2b + mfence + + movdqa PCPU(MDS_TMP),%xmm0 + testb $CR0_TS, %al + je 3f + movl %eax, %cr0 +3: ret +END(mds_handler_silvermont) Index: projects/runtime-coverage-v2/sys/i386/include/pcpu.h =================================================================== --- projects/runtime-coverage-v2/sys/i386/include/pcpu.h (revision 347598) +++ projects/runtime-coverage-v2/sys/i386/include/pcpu.h (revision 347599) @@ -1,267 +1,271 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #ifndef _SYS_CDEFS_H_ #error "sys/cdefs.h is a prerequisite for this file" #endif #include #include #include #include struct monitorbuf { int idle_state; /* Used by cpu_idle_mwait. */ int stop_state; /* Used by cpustop_handler. */ char padding[128 - (2 * sizeof(int))]; }; _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line"); /* * The SMP parts are setup in pmap.c and machdep.c for the BSP, and * pmap.c and mp_machdep.c sets up the data for the AP's to "see" when * they awake. The reason for doing it via a struct is so that an * array of pointers to each CPU's data can be set up for things like * "check curproc on all other processors" */ #define PCPU_MD_FIELDS \ struct monitorbuf pc_monitorbuf __aligned(128); /* cache line */\ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ struct segment_descriptor pc_common_tssd; \ struct segment_descriptor *pc_tss_gdt; \ struct segment_descriptor *pc_fsgs_gdt; \ struct i386tss *pc_common_tssp; \ u_int pc_kesp0; \ u_int pc_trampstk; \ int pc_currentldt; \ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_apic_id; \ int pc_private_tss; /* Flag indicating private tss*/\ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ struct mtx pc_cmap_lock; \ void *pc_cmap_pte1; \ void *pc_cmap_pte2; \ caddr_t pc_cmap_addr1; \ caddr_t pc_cmap_addr2; \ vm_offset_t pc_qmap_addr; /* KVA for temporary mappings */\ vm_offset_t pc_copyout_maddr; \ vm_offset_t pc_copyout_saddr; \ struct mtx pc_copyout_mlock; \ struct sx pc_copyout_slock; \ char *pc_copyout_buf; \ vm_offset_t pc_pmap_eh_va; \ - caddr_t pc_pmap_eh_ptep; \ + caddr_t pc_pmap_eh_ptep; \ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ uint32_t pc_ibpb_set; \ + void *pc_mds_buf; \ + void *pc_mds_buf64; \ + uint32_t pc_pad[4]; \ + uint8_t pc_mds_tmp[64]; \ u_int pc_ipi_bitmap; \ - char __pad[3606] + char __pad[3518] #ifdef _KERNEL #define MONITOR_STOPSTATE_RUNNING 0 #define MONITOR_STOPSTATE_STOPPED 1 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) /* * Evaluates to the byte offset of the per-cpu variable name. */ #define __pcpu_offset(name) \ __offsetof(struct pcpu, name) /* * Evaluates to the type of the per-cpu variable name. */ #define __pcpu_type(name) \ __typeof(((struct pcpu *)0)->name) /* * Evaluates to the address of the per-cpu variable name. */ #define __PCPU_PTR(name) __extension__ ({ \ __pcpu_type(name) *__p; \ \ __asm __volatile("movl %%fs:%1,%0; addl %2,%0" \ : "=r" (__p) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \ "i" (__pcpu_offset(name))); \ \ __p; \ }) /* * Evaluates to the value of the per-cpu variable name. */ #define __PCPU_GET(name) __extension__ ({ \ __pcpu_type(name) __res; \ struct __s { \ u_char __b[MIN(sizeof(__res), 4)]; \ } __s; \ \ if (sizeof(__res) == 1 || sizeof(__res) == 2 || \ sizeof(__res) == 4) { \ __asm __volatile("mov %%fs:%1,%0" \ : "=r" (__s) \ : "m" (*(struct __s *)(__pcpu_offset(name)))); \ *(struct __s *)(void *)&__res = __s; \ } else { \ __res = *__PCPU_PTR(name); \ } \ __res; \ }) /* * Adds a value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_ADD(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__val), 4)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("add %1,%%fs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else \ *__PCPU_PTR(name) += __val; \ } while (0) /* * Increments the value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_INC(name) do { \ CTASSERT(sizeof(__pcpu_type(name)) == 1 || \ sizeof(__pcpu_type(name)) == 2 || \ sizeof(__pcpu_type(name)) == 4); \ if (sizeof(__pcpu_type(name)) == 1) { \ __asm __volatile("incb %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 2) { \ __asm __volatile("incw %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 4) { \ __asm __volatile("incl %%fs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } \ } while (0) /* * Sets the value of the per-cpu variable name to value val. */ #define __PCPU_SET(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__val), 4)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("mov %1,%%fs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else { \ *__PCPU_PTR(name) = __val; \ } \ } while (0) #define get_pcpu() __extension__ ({ \ struct pcpu *__pc; \ \ __asm __volatile("movl %%fs:%1,%0" \ : "=r" (__pc) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace)))); \ __pc; \ }) #define PCPU_GET(member) __PCPU_GET(pc_ ## member) #define PCPU_ADD(member, val) __PCPU_ADD(pc_ ## member, val) #define PCPU_INC(member) __PCPU_INC(pc_ ## member) #define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) #define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) #define OFFSETOF_CURTHREAD 0 #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnull-dereference" #endif static __inline __pure2 struct thread * __curthread(void) { struct thread *td; __asm("movl %%fs:%1,%0" : "=r" (td) : "m" (*(char *)OFFSETOF_CURTHREAD)); return (td); } #ifdef __clang__ #pragma clang diagnostic pop #endif #define curthread (__curthread()) #define OFFSETOF_CURPCB 16 static __inline __pure2 struct pcb * __curpcb(void) { struct pcb *pcb; __asm("movl %%fs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB)); return (pcb); } #define curpcb (__curpcb()) #define IS_BSP() (PCPU_GET(cpuid) == 0) #else /* defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */ #error "this file needs to be ported to your compiler" #endif /* __GNUCLIKE_ASM etc. */ #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ Index: projects/runtime-coverage-v2/sys/kern/subr_witness.c =================================================================== --- projects/runtime-coverage-v2/sys/kern/subr_witness.c (revision 347598) +++ projects/runtime-coverage-v2/sys/kern/subr_witness.c (revision 347599) @@ -1,3084 +1,3083 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ /* Define this to check for blessed mutexes */ #undef BLESSING #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. */ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. */ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; #ifdef BLESSING struct witness_blessed { const char *b_lock1; const char *b_lock2; }; #endif struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. */ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); #ifdef BLESSING static int blessed(struct witness *, struct witness *); #endif static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) __printflike(1, 2); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. */ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. */ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; #ifdef __i386__ static const char w_notallowed[] = "The sysctl is disabled on the arch\n"; #endif static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rm }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "in_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in_multi_list_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "ifnet_rw", &lock_class_rw }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. */ { "in6_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in6_multi_list_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "ifnet_rw", &lock_class_rw }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udp", &lock_class_mtx_sleep }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcp", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_sx }, - { "bpf interface lock", &lock_class_rw }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_rw }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * TCP log locks */ { "TCP ID tree", &lock_class_rw }, { "tcp log id bucket", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "TCP log expireq", &lock_class_mtx_sleep }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, { "sio", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif #ifdef __sparc64__ { "pcib_mtx", &lock_class_mtx_spin }, { "rtc_mtx", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif { NULL, NULL }, { "sched lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { NULL, NULL }, /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #if defined(SMP) && defined(__sparc64__) { "ipi", &lock_class_mtx_spin }, #endif #ifdef __i386__ { "allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; #ifdef BLESSING /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; #endif /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * Calculate the size of early witness structures. */ int witness_startup_count(void) { int sz; sz = sizeof(struct witness) * witness_count; sz += sizeof(*w_rmatrix) * (witness_count + 1); sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) * (witness_count + 1); return (sz); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ void witness_startup(void *mem) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; uintptr_t p; int i; p = (uintptr_t)mem; w_data = (void *)p; p += sizeof(struct witness) * witness_count; w_rmatrix = (void *)p; p += sizeof(*w_rmatrix) * (witness_count + 1); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = (void *)p; p += sizeof(*w_rmatrix[i]) * (witness_count + 1); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. */ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch < 1 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. */ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no anscestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || panicstr != NULL) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. */ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. */ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enfore a general lock order of * sleepable locks before non-sleepable locks. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reveresal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't * look for other lock order violations though, which * may be a bug. */ if (blessed(w, w1)) goto out; #endif /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } else { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, lock2->li_lock->lo_witness->w_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } witness_debugger(1, __func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. */ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags = LI_EXCLUSIVE; else instance->li_flags = 0; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || panicstr != NULL) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When disabling WITNESS through witness_watch we could end up in * having registered locks in the td_sleeplocks queue. * We have to make sure we flush these queues, so just search for * eventual register locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. */ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * In order to reduce contention on w_mtx, we want to keep always an * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have * further objects in the list, so the list ownership needs to be * hand over to another object if the current head needs to be freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || panicstr != NULL) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || panicstr != NULL) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once that all verifies are passed about spinlocks ownership, * the thread is in a safe path and it can be unpinned. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spinlock and as long as * the flags cannot match for this locks class, * check if the first spinlock is the one curthread * should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; MPASS(description != NULL); if (witness_watch == -1 || panicstr != NULL) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; } else { w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. */ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or inderect descendant of @ancestor. */ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } #ifdef BLESSING static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } #endif static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS(alllocks, db_witness_list_all) DB_SHOW_COMMAND(witness, db_witness_display) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. */ *tmp_w2 = *w2; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_printf(sb, "\n"); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_printf(sb, "\n"); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. */ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } DB_SHOW_COMMAND(badstacks, db_witness_badstacks) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; #ifdef __i386__ error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed)); return (error); #endif if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "\n"); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. */ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. */ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_zero(&data->wlod_stack); stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. */ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_zero(&st); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } Index: projects/runtime-coverage-v2/sys/net/if_tuntap.c =================================================================== --- projects/runtime-coverage-v2/sys/net/if_tuntap.c (revision 347598) +++ projects/runtime-coverage-v2/sys/net/if_tuntap.c (revision 347599) @@ -1,1718 +1,1718 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. */ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_RCOLL 0x0004 #define TUN_IASET 0x0008 #define TUN_DSTADDR 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_VMIO_FLAG_MASK 0x0fff /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. */ static struct mtx tunmtx; static eventhandler_tag tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user open() */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, "IP tunnel software network interface."); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation."); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Allow user to open /dev/tap (based on node permissions)"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev, struct tuntap_driver *); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); static int tun_clone_destroy(struct if_clone *, struct ifnet *); static d_open_t tunopen; static d_close_t tunclose; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_t *clone_match_fn; ifc_create_t *clone_create_fn; ifc_destroy_t *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. */ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that/s okay. */ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); if (i) { /* No preexisting struct cdev *, create one */ dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600, "%s%d", drv->cdevsw.d_name, unit); } tuncreate(dev, drv); return (0); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } /* No preexisting struct cdev *, create one */ *dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred, UID_UUCP, GID_DIALER, 0600, "%s", name); } if_clone_create(name, namelen, NULL); out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if ((tp->tun_flags & TUN_OPEN) != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); - sx_xlock(&tun_ioctl_sx); - TUN2IFP(tp)->if_softc = NULL; - sx_xunlock(&tun_ioctl_sx); destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } + sx_xlock(&tun_ioctl_sx); + TUN2IFP(tp)->if_softc = NULL; + sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, drv->clone_match_fn, drv->clone_create_fn, drv->clone_destroy_fn); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(dev_clone, tag); drain_dev_clone_events(); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ /* XXX: should return an error code so it can fail. */ static void tuncreate(struct cdev *dev, struct tuntap_driver *drv) { struct tuntap_softc *sc; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&sc->tun_cv, "tun_condvar"); sc->tun_flags = drv->ident_flags; sc->tun_dev = dev; sc->tun_drv = drv; mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); mtx_unlock(&tunmtx); iflags = IFF_MULTICAST; if ((sc->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = sc->tun_ifp = if_alloc(type); if (ifp == NULL) panic("%s%d: failed to if_alloc() interface.\n", drv->cdevsw.d_name, dev2unit(dev)); ifp->if_softc = sc; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; if ((sc->tun_flags & TUN_L2) != 0) { ifp->if_mtu = ETHERMTU; ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } dev->si_drv1 = sc; TUN_LOCK(sc); sc->tun_flags |= TUN_INITED; TUN_UNLOCK(sc); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_driver *drv; struct tuntap_softc *tp; int error, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } if ((tunflags & TUN_L2) != 0) { /* Restrict? */ if (tap_allow_uopen == 0) { error = priv_check(td, PRIV_NET_TAP); if (error != 0) { CURVNET_RESTORE(); return (error); } } } /* * XXXRW: Non-atomic test and set of dev->si_drv1 requires * synchronization. */ tp = dev->si_drv1; if (!tp) { drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) { CURVNET_RESTORE(); return (ENXIO); } tuncreate(dev, drv); tp = dev->si_drv1; } TUN_LOCK(tp); if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); CURVNET_RESTORE(); return (0); } /* * tunclose - close the device - mark i/f down & delete * routing info */ static int tunclose(struct cdev *dev, int foo, int bar, struct thread *td) { struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Simply close the device if this isn't the controlling process. This * may happen if, for instance, the tunnel has been handed off to * another process. The original controller should be able to close it * without putting us into an inconsistent state. */ if (td->td_proc->p_pid != tp->tun_pid) { TUN_UNLOCK(tp); return (0); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { struct ifaddr *ifa; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { /* deal w/IPv4 PtP destination; unlocked read */ if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) { rtinit(ifa, (int)RTM_DELETE, tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); } else { rtinit(ifa, (int)RTM_DELETE, 0); } } if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; cv_broadcast(&tp->tun_cv); TUN_UNLOCK(tp); return (0); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; #ifdef INET struct ifaddr *ifa; #endif TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); #ifdef INET if_addr_rlock(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { struct sockaddr_in *si; si = (struct sockaddr_in *)ifa->ifa_addr; if (si->sin_addr.s_addr) tp->tun_flags |= TUN_IASET; si = (struct sockaddr_in *)ifa->ifa_dstaddr; if (si && si->sin_addr.s_addr) tp->tun_flags |= TUN_DSTADDR; } } if_addr_runlock(ifp); #endif TUN_UNLOCK(tp); } else { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } /* * Process an ioctl request. */ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. */ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; struct tuninfo *tunp; int error, iflags; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) int ival; #endif bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { case TAPGIFNAME: ifrp = (struct ifreq *)data; strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); return (0); /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); TUN2IFP(tp)->if_flags = iflags | (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. */ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; int error=0, len; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; do { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } } while (m == NULL); TUN_UNLOCK(tp); if ((tp->tun_flags & TUN_L2) != 0) BPF_MTAP(ifp, m); while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); if (len != 0) error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m) { TUNDEBUG(ifp, "Dropping mbuf\n"); m_freem(m); } return (error); } static int tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m) { struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); (*ifp->if_input)(ifp, m); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? */ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; int align; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; align = 0; mru = l2tun ? TAPMRU : TUNMRU; if (l2tun) align = ETHER_ALIGN; else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) return (tunwrite_l2(tp, m)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } if (events & (POLLOUT | POLLWRNORM)) revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. */ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } Index: projects/runtime-coverage-v2/sys/netinet/in_mcast.c =================================================================== --- projects/runtime-coverage-v2/sys/netinet/in_mcast.c (revision 347598) +++ projects/runtime-coverage-v2/sys/netinet/in_mcast.c (revision 347599) @@ -1,3162 +1,3142 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_list_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); struct mtx in_multi_free_mtx; MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); struct sx in_multi_sx; SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); int ifma_restart; /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). */ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static int imo_grow(struct ip_moptions *); static size_t imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(const struct ip_moptions *, const size_t, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif static struct grouptask free_gtask; static struct in_multi_head inm_free_list; static void inm_release_task(void *arg __unused); static void inm_init(void) { SLIST_INIT(&inm_free_list); taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task"); } #ifdef EARLY_AP_STARTUP SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, inm_init, NULL); #else SYSINIT(inm_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_FIRST, inm_init, NULL); #endif void inm_release_list_deferred(struct in_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); GROUPTASK_ENQUEUE(&free_gtask); } void inm_disconnect(struct in_multi *inm) { struct ifnet *ifp; struct ifmultiaddr *ifma, *ll_ifma; ifp = inm->inm_ifp; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->inm_ifma; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); ifma_restart = true; } } } void inm_release_deferred(struct in_multi *inm) { struct in_multi_head tmp; IN_MULTI_LIST_LOCK_ASSERT(); MPASS(inm->inm_refcount > 0); if (--inm->inm_refcount == 0) { SLIST_INIT(&tmp); inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); inm_release_list_deferred(&tmp); } } static void inm_release_task(void *arg __unused) { struct in_multi_head inm_free_tmp; struct in_multi *inm, *tinm; SLIST_INIT(&inm_free_tmp); mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); IN_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); MPASS(inm); inm_release(inm); } IN_MULTI_UNLOCK(); } /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct epoch_tracker et; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); NET_EPOCH_ENTER(et); inm = inm_lookup_locked(ifp, ina); NET_EPOCH_EXIT(et); return (inm); } /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_INMFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_multi **pinm; int idx; int nmships; gsin = (const struct sockaddr_in *)group; /* The imo_membership array may be lazy allocated. */ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(const struct ip_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip_msource find; struct in_mfilter *imf; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazy allocated. */ if (imo->imo_mfilters == NULL) return (NULL); imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = imo_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->imo_mfilters[gidx].imf_st[1]; ims = imo_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); inm_acquire_locked(inm); *pinm = inm; } IN_MULTI_LIST_UNLOCK(); if (inm != NULL) return (0); memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) { char addrbuf[INET_ADDRSTRLEN]; panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif inm_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ static void inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); } } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__, haddr, ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. */ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); inm_release_deferred(inm); } else { *pinm = inm; } IN_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. */ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; error = 0; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); IF_ADDR_WLOCK(inm->inm_ifp); inm_release_deferred(inm); IF_ADDR_WUNLOCK(inm->inm_ifp); IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; #ifdef INVARIANTS char addrbuf[INET_ADDRSTRLEN]; #endif KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa_r(*ap, addrbuf))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct rm_priotracker in_ifa_tracker; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(mreqs.imr_interface, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are actually a member of this group. */ + IN_MULTI_LOCK(); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; inm = imo->imo_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ - IN_MULTI_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); -out_in_multi_locked: - - IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); + IN_MULTI_UNLOCK(); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_INMFILTER); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } static void inp_gcmoptions(struct ip_moptions *imo) { - struct in_mfilter *imf; + struct in_mfilter *imf; struct in_multi *inm; struct ifnet *ifp; - size_t idx, nmships; + size_t idx, nmships; nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; if (imf) imf_leave(imf); inm = imo->imo_membership[idx]; ifp = inm->inm_ifp; if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); (void)in_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in_leavegroup(inm, imf); } if (imf) imf_purge(imf); } if (imo->imo_mfilters) free(imo->imo_mfilters, M_INMFILTER); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is deferred to the epoch callback task. */ void inp_freemoptions(struct ip_moptions *imo) { if (imo == NULL) return; inp_gcmoptions(imo); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { struct epoch_tracker et; mreqn.imr_ifindex = ifp->if_index; NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) mreqn.imr_address = IA_SIN(ia)->sin_addr; NET_EPOCH_EXIT(et); } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; struct nhop4_basic nh4; uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(ina, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } else { fibnum = inp ? inp->inp_inc.inc_fibnum : 0; if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) ifp = nh4.nh_ifp; else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; size_t idx; int error, is_new; ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: { struct ip_mreqn mreqn; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); else error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqn.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && mreqn.imr_ifindex != 0) ifp = ifnet_byindex(mreqn.imr_ifindex); else ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqn.imr_address); break; } case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = ssa->sin.sin_family = AF_INET; gsa->sin.sin_len = ssa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ssa->sin.sin_addr = mreqs.imr_sourceaddr; ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); - break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); + IN_MULTI_LOCK(); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_inp_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->imo_num_memberships; imo->imo_membership[idx] = NULL; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imf_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_imo_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at IGMP layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN_MULTI_LOCK(); - if (is_new) { error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imo_free; } inm_acquire(inm); + KASSERT(imo->imo_membership[idx] == NULL, + ("%s: imo_membership already allocated", __func__)); imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); - goto out_in_multi_locked; + goto out_imf_rollback; } } -out_in_multi_locked: - - IN_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); +out_imf_rollback: if (error) { imf_rollback(imf); if (is_new) imf_purge(imf); else imf_reap(imf); } else { imf_commit(imf); } out_imo_free: if (error && is_new) { inm = imo->imo_membership[idx]; if (inm != NULL) { IN_MULTI_LIST_LOCK(); inm_release_deferred(inm); IN_MULTI_LIST_UNLOCK(); } imo->imo_membership[idx] = NULL; --imo->imo_num_memberships; } out_inp_locked: INP_WUNLOCK(inp); + IN_MULTI_UNLOCK(); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; struct rm_priotracker in_ifa_tracker; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; int error, is_final; ifp = NULL; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. */ if (!in_nullhost(mreqs.imr_interface)) { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(mreqs.imr_interface, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); } CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ + IN_MULTI_LOCK(); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in_leavegroup_locked(inm, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); } } -out_in_multi_locked: - - IN_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); - +out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); if (is_final) { /* Remove the gap in the membership and filter array. */ KASSERT(RB_EMPTY(&imf->imf_sources), - ("%s: imf_sources not empty", __func__)); + ("%s: imf_sources (%p %p %zu) not empty", __func__, imf, imo, idx)); for (++idx; idx < imo->imo_num_memberships; ++idx) { imo->imo_membership[idx - 1] = imo->imo_membership[idx]; imo->imo_mfilters[idx - 1] = imo->imo_mfilters[idx]; } imf_init(&imo->imo_mfilters[idx - 1], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_num_memberships--; } out_inp_locked: INP_WUNLOCK(inp); + IN_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { IN_IFADDR_RLOCK(&in_ifa_tracker); INADDR_TO_IFP(addr, ifp); IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp, ntohl(addr.s_addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. - * - * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ + IN_MULTI_LOCK(); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); - IN_MULTI_LOCK(); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); -out_in_multi_locked: - - IN_MULTI_UNLOCK(); - out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); + IN_MULTI_UNLOCK(); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast", __func__, ntohl(group.s_addr)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) return (retval); IN_MULTI_LIST_LOCK(); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); /* * Only copy-out sources which are in-mode. */ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } NET_EPOCH_EXIT(et); IN_MULTI_LIST_UNLOCK(); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { [MCAST_UNDEFINED] = "un", [MCAST_INCLUDE] = "in", [MCAST_EXCLUDE] = "ex", }; _Static_assert(MCAST_UNDEFINED == 0 && MCAST_EXCLUDE + 1 == nitems(inm_modestrs), "inm_modestrs: no longer matches #defines"); static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { [IGMP_NOT_MEMBER] = "not-member", [IGMP_SILENT_MEMBER] = "silent", [IGMP_REPORTING_MEMBER] = "reporting", [IGMP_IDLE_MEMBER] = "idle", [IGMP_LAZY_MEMBER] = "lazy", [IGMP_SLEEPING_MEMBER] = "sleeping", [IGMP_AWAKENING_MEMBER] = "awakening", [IGMP_G_QUERY_PENDING_MEMBER] = "query-pending", [IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending", [IGMP_LEAVING_MEMBER] = "leaving", }; _Static_assert(IGMP_NOT_MEMBER == 0 && IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs), "inm_statetrs: no longer matches #defines"); static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); Index: projects/runtime-coverage-v2/sys/netinet6/in6_mcast.c =================================================================== --- projects/runtime-coverage-v2/sys/netinet6/in6_mcast.c (revision 347598) +++ projects/runtime-coverage-v2/sys/netinet6/in6_mcast.c (revision 347599) @@ -1,2999 +1,2984 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv6 multicast socket, group, and socket option processing module. * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in6 sin6; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter", "IPv6 multicast PCB-layer source filter"); MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group"); static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options"); static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource", "IPv6 multicast MLD-layer source filter"); RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK. * * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly * any need for in6_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in6_multi_list_mtx; MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF); struct mtx in6_multi_free_mtx; MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF); struct sx in6_multi_sx; SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx"); static void im6f_commit(struct in6_mfilter *); static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **); static struct in6_msource * im6f_graft(struct in6_mfilter *, const uint8_t, const struct sockaddr_in6 *); static void im6f_leave(struct in6_mfilter *); static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); static void im6f_purge(struct in6_mfilter *); static void im6f_rollback(struct in6_mfilter *); static void im6f_reap(struct in6_mfilter *); static int im6o_grow(struct ip6_moptions *); static size_t im6o_match_group(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *); static struct in6_msource * im6o_match_source(const struct ip6_moptions *, const size_t, const struct sockaddr *); static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback); static int in6_getmulti(struct ifnet *, const struct in6_addr *, struct in6_multi **); static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims); #ifdef KTR static int in6m_is_ifp_detached(const struct in6_multi *); #endif static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); static void in6m_purge(struct in6_multi *); static void in6m_reap(struct in6_multi *); static struct ip6_moptions * in6p_findmoptions(struct inpcb *); static int in6p_get_source_filters(struct inpcb *, struct sockopt *); static int in6p_join_group(struct inpcb *, struct sockopt *); static int in6p_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in6 *); static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); static int in6p_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */ static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv6 multicast"); static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0, "Max source filters per socket"); /* TODO Virtualize this switch. */ int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline in6m_is_ifp_detached(const struct in6_multi *inm) { struct ifnet *ifp; KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that network-layer notion of ifp is the * same as that of link-layer. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in6_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void im6f_init(struct in6_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in6_mfilter)); RB_INIT(&imf->im6f_sources); imf->im6f_st[0] = st0; imf->im6f_st[1] = st1; } /* * Resize the ip6_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int im6o_grow(struct ip6_moptions *imo) { struct in6_multi **nmships; struct in6_multi **omships; struct in6_mfilter *nmfilters; struct in6_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->im6o_membership; omfilters = imo->im6o_mfilters; oldmax = imo->im6o_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IPV6_MAX_MEMBERSHIPS) { nmships = (struct in6_multi **)realloc(omships, sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT); nmfilters = (struct in6_mfilter *)realloc(omfilters, sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { im6f_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->im6o_max_memberships = newmax; imo->im6o_membership = nmships; imo->im6o_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IP6MOPTS); if (nmfilters != NULL) free(nmfilters, M_IN6MFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv6 multicast group entry for this ip6_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in6 *gsin6; struct in6_multi **pinm; int idx; int nmships; gsin6 = (const struct sockaddr_in6 *)group; /* The im6o_membership array may be lazy allocated. */ if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0) return (-1); nmships = imo->im6o_num_memberships; pinm = &imo->im6o_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) && IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr, &gsin6->sin6_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv6 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * XXX TODO: The scope ID, if present in src, is stripped before * any comparison. We SHOULD enforce scope/zone checks where the source * filter entry has a link scope. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in6_msource * im6o_match_source(const struct ip6_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip6_msource find; struct in6_mfilter *imf; struct ip6_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__)); KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The im6o_mfilters array may be lazy allocated. */ if (imo->im6o_mfilters == NULL) return (NULL); imf = &imo->im6o_mfilters[gidx]; psa = (const sockunion_t *)src; find.im6s_addr = psa->sin6.sin6_addr; in6_clearscope(&find.im6s_addr); /* XXX */ ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); return ((struct in6_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in6_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = im6o_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at MLD t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->im6o_mfilters[gidx].im6f_st[1]; ims = im6o_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->im6sl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in6_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN6_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, struct in6_multi **pinm) { struct epoch_tracker et; struct sockaddr_in6 gsin6; struct ifmultiaddr *ifma; struct in6_multi *inm; int error; error = 0; /* * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK; * if_addmulti() takes this mutex itself, so we must drop and * re-acquire around the call. */ IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, group); NET_EPOCH_EXIT(et); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->in6m_refcount >= 1, ("%s: bad refcount %d", __func__, inm->in6m_refcount)); in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } memset(&gsin6, 0, sizeof(gsin6)); gsin6.sin6_family = AF_INET6; gsin6.sin6_len = sizeof(struct sockaddr_in6); gsin6.sin6_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); if (error != 0) return (error); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet6 is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in6_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET6, ("%s: ifma not AF_INET6", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp || !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)) panic("%s: ifma %p is inconsistent with %p (%p)", __func__, ifma, inm, group); #endif in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in6_multi record is needed; allocate and initialize it. * We DO NOT perform an MLD join as the in6_ layer may need to * push an initial source list down to MLD to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. * Pending state-changes per group are subject to a bounds check. */ inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); if_delmulti_ifma(ifma); return (ENOMEM); } inm->in6m_addr = *group; inm->in6m_ifp = ifp; inm->in6m_mli = MLD_IFINFO(ifp); inm->in6m_ifma = ifma; inm->in6m_refcount = 1; inm->in6m_state = MLD_NOT_MEMBER; mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES); inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->in6m_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); return (error); } /* * Drop a reference to an in6_multi record. * * If the refcount drops to 0, free the in6_multi record and * delete the underlying link-layer membership. */ static void in6m_release(struct in6_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount); MPASS(inm->in6m_refcount == 0); CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm); ifma = inm->in6m_ifma; ifp = inm->in6m_ifp; MPASS(ifma->ifma_llifma == NULL); /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma); KASSERT(ifma->ifma_protospec == NULL, ("%s: ifma_protospec != NULL", __func__)); if (ifp == NULL) ifp = ifma->ifma_ifp; if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); } } static struct grouptask free_gtask; static struct in6_multi_head in6m_free_list; static void in6m_release_task(void *arg __unused); static void in6m_init(void) { SLIST_INIT(&in6m_free_list); taskqgroup_config_gtask_init(NULL, &free_gtask, in6m_release_task, "in6m release task"); } #ifdef EARLY_AP_STARTUP SYSINIT(in6m_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, in6m_init, NULL); #else SYSINIT(in6m_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_SECOND, in6m_init, NULL); #endif void in6m_release_list_deferred(struct in6_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); GROUPTASK_ENQUEUE(&free_gtask); } void in6m_release_wait(void) { /* Wait for all jobs to complete. */ gtaskqueue_drain_all(free_gtask.gt_taskqueue); } void in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm) { struct ifnet *ifp; struct ifaddr *ifa; struct in6_ifaddr *ifa6; struct in6_multi_mship *imm, *imm_tmp; struct ifmultiaddr *ifma, *ll_ifma; IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; if (ifp == NULL) return; /* already called */ inm->in6m_ifp = NULL; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->in6m_ifma; if (ifma == NULL) return; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); } } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (void *)ifa; LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships, i6mm_chain, imm_tmp) { if (inm == imm->i6mm_maddr) { LIST_REMOVE(imm, i6mm_chain); free(imm, M_IP6MADDR); in6m_rele_locked(inmh, inm); } } } } static void in6m_release_task(void *arg __unused) { struct in6_multi_head in6m_free_tmp; struct in6_multi *inm, *tinm; SLIST_INIT(&in6m_free_tmp); mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); IN6_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele); in6m_release(inm); } IN6_MULTI_UNLOCK(); } /* * Clear recorded source entries for a group. * Used by the MLD code. Caller must hold the IN6_MULTI lock. * FIXME: Should reap. */ void in6m_clear_recorded(struct in6_multi *inm) { struct ip6_msource *ims; IN6_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { if (ims->im6s_stp) { ims->im6s_stp = 0; --inm->in6m_st[1].iss_rec; } } KASSERT(inm->in6m_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group MLDv2 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet6.mld.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) { struct ip6_msource find; struct ip6_msource *ims, *nims; IN6_MULTI_LIST_LOCK_ASSERT(); find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims && ims->im6s_stp) return (0); if (ims == NULL) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->im6s_addr = find.im6s_addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->im6s_stp; ++inm->in6m_st[1].iss_rec; return (1); } /* * Return a pointer to an in6_msource owned by an in6_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * addr is the source address. * * SMPng: May be called with locks held; malloc must not block. */ static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **plims) { struct ip6_msource find; struct ip6_msource *ims, *nims; struct in6_msource *lims; int error; error = 0; ims = NULL; lims = NULL; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); lims = (struct in6_msource *)ims; if (lims == NULL) { if (imf->im6f_nsrc == in6_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in6_msource *)nims; lims->im6s_addr = find.im6s_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in6_msource * im6f_graft(struct in6_mfilter *imf, const uint8_t st1, const struct sockaddr_in6 *psin) { struct ip6_msource *nims; struct in6_msource *lims; nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in6_msource *)nims; lims->im6s_addr = psin->sin6_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; lims->im6sl_st[1] = st1; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) { struct ip6_msource find; struct ip6_msource *ims; struct in6_msource *lims; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void im6f_rollback(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) { /* no change at t1 */ continue; } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->im6sl_st[1] = lims->im6sl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } imf->im6f_st[1] = imf->im6f_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void im6f_leave(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; } imf->im6f_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void im6f_commit(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[0] = lims->im6sl_st[1]; } imf->im6f_st[0] = imf->im6f_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void im6f_reap(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && (lims->im6sl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_MLD, "%s: free lims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } } /* * Purge socket-layer filter set. */ static void im6f_purge(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * addr is the IPv6 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims) { struct ip6_msource find; struct ip6_msource *ims, *nims; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims == NULL && !noalloc) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->im6s_addr = *addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; CTR3(KTR_MLD, "%s: allocated %s as %p", __func__, ip6_sprintf(ip6tbuf, addr), ims); } *pims = ims; return (0); } /* * Merge socket-layer source into MLD-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; ip6_sprintf(ip6tbuf, &lims->im6s_addr); #endif if (lims->im6sl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex -= n; } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in -= n; } if (lims->im6sl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex += n; } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in += n; } } /* * Atomically update the global in6_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct ip6_msource *ims, *nims; struct in6_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN6_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); ++schanged; if (error) break; im6s_merge(nims, lims, 0); } if (error) { struct ip6_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; (void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims); if (bims == NULL) continue; im6s_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->im6f_st[0] == imf->im6f_st[1] && imf->im6f_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->im6f_st[0] != imf->im6f_st[1]) { CTR3(KTR_MLD, "%s: imf transition %d to %d", __func__, imf->im6f_st[0], imf->im6f_st[1]); if (imf->im6f_st[0] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__); --inm->in6m_st[1].iss_ex; } else if (imf->im6f_st[0] == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } if (imf->im6f_st[1] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__); inm->in6m_st[1].iss_ex++; } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__); inm->in6m_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the MLD lifecycle for this group should finish. */ if (inm->in6m_st[1].iss_ex > 0) { CTR1(KTR_MLD, "%s: transition to EX", __func__); inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->in6m_st[1].iss_in > 0) { CTR1(KTR_MLD, "%s: transition to IN", __func__); inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_MLD, "%s: transition to UNDEF", __func__); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->im6f_st[1] != MCAST_EXCLUDE) || (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__); --inm->in6m_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__); inm->in6m_st[1].iss_asm++; } CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm); in6m_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_MLD, "%s: sources changed; reaping", __func__); in6m_reap(inm); } return (error); } /* * Mark an in6_multi's filter set deltas as committed. * Called by MLD after a state change has been enqueued. */ void in6m_commit(struct in6_multi *inm) { struct ip6_msource *ims; CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm); CTR1(KTR_MLD, "%s: pre commit:", __func__); in6m_print(inm); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { ims->im6s_st[0] = ims->im6s_st[1]; } inm->in6m_st[0] = inm->in6m_st[1]; } /* * Reap unreferenced nodes from an in6_multi's filter set. */ static void in6m_reap(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || ims->im6s_stp != 0) continue; CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } } /* * Purge all source nodes from an in6_multi's filter set. */ static void in6m_purge(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } /* Free state-change requests that might be queued. */ mbufq_drain(&inm->in6m_scq); } /* * Join a multicast address w/o sources. * KAME compatibility entry point. * * SMPng: Assume no mc locks held by caller. */ int in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { int error; IN6_MULTI_LOCK(); error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay); IN6_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the MLD downcall fails, the group is not joined, and an error * code is returned. */ int in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { struct in6_multi_head inmh; struct in6_mfilter timf; struct in6_multi *inm; struct ifmultiaddr *ifma; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif /* * Sanity: Check scope zone ID was set for ifp, if and * only if group is scoped to an interface. */ KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr), ("%s: not a multicast address", __func__)); if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { KASSERT(mcaddr->s6_addr16[1] != 0, ("%s: scope zone ID not set", __func__)); } IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__, ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp)); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in6_getmulti(ifp, mcaddr, &inm); if (error) { CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__); return (error); } IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); goto out_in6m_release; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, delay); if (error) { CTR1(KTR_MLD, "%s: failed to update source", __func__); goto out_in6m_release; } out_in6m_release: SLIST_INIT(&inmh); if (error) { struct epoch_tracker et; CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_protospec == inm) { ifma->ifma_protospec = NULL; break; } } in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); NET_EPOCH_EXIT(et); } else { *pinm = inm; } IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { int error; IN6_MULTI_LOCK(); error = in6_leavegroup_locked(inm, imf); IN6_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as in6m_release(*) as this function also * makes a state change downcall into MLD. */ int in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct in6_multi_head inmh; struct in6_mfilter timf; struct ifnet *ifp; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif error = 0; IN6_MULTI_LOCK_ASSERT(); CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__, inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr), (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at MLD layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = 0; if (ifp) error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); if (ifp) IF_ADDR_WLOCK(ifp); SLIST_INIT(&inmh); if (inm->in6m_refcount == 1) in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); if (ifp) IF_ADDR_WUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An MLD downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; size_t idx; uint16_t fmode; int error, doblock; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Check if we are actually a member of this group. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } KASSERT(imo->im6o_mfilters != NULL, ("%s: im6o_mfilters not allocated", __func__)); imf = &imo->im6o_mfilters[idx]; inm = imo->im6o_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->im6f_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = im6o_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_MLD, "%s: source %s %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_MLD, "%s: %s source", __func__, "block"); ims = im6f_graft(imf, fmode, &ssa->sin6); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); error = im6f_prune(imf, &ssa->sin6); } if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_im6f_rollback; } /* * Begin state merge transaction at MLD layer. */ IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip6_moptions * in6p_findmoptions(struct inpcb *inp) { struct ip6_moptions *imo; struct in6_multi **immp; struct in6_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) return (inp->in6p_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS, M_IN6MFILTER, M_WAITOK); imo->im6o_multicast_ifp = NULL; imo->im6o_multicast_hlim = V_ip6_defmcasthlim; imo->im6o_multicast_loop = in6_mcast_loop; imo->im6o_num_memberships = 0; imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; imo->im6o_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++) im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->im6o_mfilters = imfp; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) { free(imfp, M_IN6MFILTER); free(immp, M_IP6MOPTS); free(imo, M_IP6MOPTS); return (inp->in6p_moptions); } inp->in6p_moptions = imo; return (imo); } /* * Discard the IPv6 multicast options (and source filters). * * SMPng: NOTE: assumes INP write lock is held. * * XXX can all be safely deferred to epoch_call * */ static void inp_gcmoptions(struct ip6_moptions *imo) { struct in6_mfilter *imf; struct in6_multi *inm; struct ifnet *ifp; size_t idx, nmships; nmships = imo->im6o_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL; if (imf) im6f_leave(imf); inm = imo->im6o_membership[idx]; ifp = inm->in6m_ifp; if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); (void)in6_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in6_leavegroup(inm, imf); } if (imf) im6f_purge(imf); } if (imo->im6o_mfilters) free(imo->im6o_mfilters, M_IN6MFILTER); free(imo->im6o_membership, M_IP6MOPTS); free(imo, M_IP6MOPTS); } void ip6_freemoptions(struct ip6_moptions *imo) { if (imo == NULL) return; inp_gcmoptions(imo); } /* * Atomically get source filters on a socket for an IPv6 multicast group. * Called with INP lock held; returns with lock released. */ static int in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip6_moptions *imo; struct in6_mfilter *imf; struct ip6_msource *ims; struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->in6p_moptions; KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); INP_WLOCK(inp); /* * Lookup group on the socket. */ idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->im6o_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->im6f_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->im6f_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) msfr.msfr_nsrcs = in6_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == MCAST_UNDEFINED || lims->im6sl_st[0] != imf->im6f_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in6 *)ptss; psin->sin6_family = AF_INET6; psin->sin6_len = sizeof(struct sockaddr_in6); psin->sin6_addr = lims->im6s_addr; psin->sin6_port = 0; --nsrcs; ++ptss; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; u_int optval; INP_WLOCK(inp); im6o = inp->in6p_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) { optval = 0; } else { optval = im6o->im6o_multicast_ifp->if_index; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_HOPS: if (im6o == NULL) optval = V_ip6_defmcasthlim; else optval = im6o->im6o_multicast_hlim; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_LOOP: if (im6o == NULL) optval = in6_mcast_loop; /* XXX VIMAGE */ else optval = im6o->im6o_multicast_loop; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MSFILTER: if (im6o == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = in6p_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the address of an IPv6 group. * * This routine exists to support legacy IPv6 multicast applications. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. Look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * If the FIB lookup fails, return NULL. * * FUTURE: Support multiple forwarding tables for IPv6. * * Returns NULL if no ifp could be found. */ static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *in6p, const struct sockaddr_in6 *gsin6) { struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; uint32_t fibnum; KASSERT(in6p->inp_vflag & INP_IPV6, ("%s: not INP_IPV6 inpcb", __func__)); KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB; if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0) return (NULL); return (nh6.nh_ifp); } /* * Join an IPv6 multicast group, possibly with a source. * * FIXME: The KAME use of the unspecified address (::) * to join *all* multicast groups is currently unsupported. */ static int in6p_join_group(struct inpcb *inp, struct sockopt *sopt) { struct in6_multi_head inmh; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; struct in6_msource *lims; size_t idx; int error, is_new; SLIST_INIT(&inmh); ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything into struct group_source_req. * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * Ignore passed-in scope ID. */ switch (sopt->sopt_name) { case IPV6_JOIN_GROUP: { struct ipv6_mreq mreq; error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; if (mreq.ipv6mr_interface == 0) { ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { if (V_if_index < mreq.ipv6mr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(mreq.ipv6mr_interface); } CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p", __func__, mreq.ipv6mr_interface, ifp); } break; case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); ssa->sin6.sin6_port = 0; ssa->sin6.sin6_scope_id = 0; } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; /* * Always set the scope zone ID on memberships created from userland. * Use the passed-in ifp to do this. * XXX The in6_setscope() return value is meaningless. * XXX SCOPE6_LOCK() is taken by in6_setscope(). */ (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); + IN6_MULTI_LOCK(); imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->im6f_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in6_msource is transactioned just as for anything * else in SSM -- but note naive use of in6m_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = im6o_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->im6sl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_in6p_locked; } } else { /* * MCAST_JOIN_GROUP alone, on any existing membership, * is rejected, to stop the same inpcb tying up * multiple refs to the in_multi. * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; goto out_in6p_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->im6o_num_memberships == imo->im6o_max_memberships) { error = im6o_grow(imo); if (error) goto out_in6p_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->im6o_num_memberships; imo->im6o_membership[idx] = NULL; imo->im6o_num_memberships++; KASSERT(imo->im6o_mfilters != NULL, ("%s: im6f_mfilters vector was not allocated", __func__)); imf = &imo->im6o_mfilters[idx]; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in6_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/source", __func__); im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); } lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6); if (lims == NULL) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_im6o_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/o source", __func__); im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at MLD layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN6_MULTI_LOCK(); - if (is_new) { error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf, &inm, 0); if (error) { IN6_MULTI_UNLOCK(); goto out_im6o_free; } /* * NOTE: Refcount from in6_joingroup_locked() * is protecting membership. */ imo->im6o_membership[idx] = inm; } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); } - IN6_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); if (error) { im6f_rollback(imf); if (is_new) im6f_purge(imf); else im6f_reap(imf); } else { im6f_commit(imf); } out_im6o_free: if (error && is_new) { inm = imo->im6o_membership[idx]; if (inm != NULL) { IN6_MULTI_LIST_LOCK(); in6m_rele_locked(&inmh, inm); IN6_MULTI_LIST_UNLOCK(); } imo->im6o_membership[idx] = NULL; --imo->im6o_num_memberships; } out_in6p_locked: INP_WUNLOCK(inp); + IN6_MULTI_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Leave an IPv6 multicast group on an inpcb, possibly with a source. */ static int in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct ipv6_mreq mreq; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint32_t ifindex; size_t idx; int error, is_final; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; ifindex = 0; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything passed in up into a struct group_source_req * as that is easier to process. * Note: Any embedded scope ID in the multicast group passed * in by userland is ignored, the interface index is the recommended * mechanism to specify an interface; see below. */ switch (sopt->sopt_name) { case IPV6_LEAVE_GROUP: error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = mreq.ipv6mr_interface; break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); } gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = gsr.gsr_interface; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); /* * Validate interface index if provided. If no interface index * was provided separately, attempt to look the membership up * from the default scope as a last resort to disambiguate * the membership we are being asked to leave. * XXX SCOPE6 lock potentially taken here. */ if (ifindex != 0) { if (V_if_index < ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); } else { error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone); if (error) return (EADDRNOTAVAIL); /* * Some badly behaved applications don't pass an ifindex * or a scope ID, which is an API violation. In this case, * perform a lookup as per a v6 join. * * XXX For now, stomp on zone ID for the corner case. * This is not the 'KAME way', but we need to see the ifp * directly until such time as this implementation is * refactored, assuming the scope IDs are the way to go. */ ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]); if (ifindex == 0) { CTR2(KTR_MLD, "%s: warning: no ifindex, looking up " "ifp for group %s.", __func__, ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr)); ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { ifp = ifnet_byindex(ifindex); } if (ifp == NULL) return (EADDRNOTAVAIL); } CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp); KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__)); /* * Find the membership in the membership array. */ + IN6_MULTI_LOCK(); imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { im6f_leave(imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_in6p_locked; } ims = im6o_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_MLD, "%s: source %p %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } CTR2(KTR_MLD, "%s: %s source", __func__, "block"); error = im6f_prune(imf, &ssa->sin6); if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_in6p_locked; } } /* * Begin state merge transaction at MLD layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN6_MULTI_LOCK(); - if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in6_leavegroup_locked(inm, imf); } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); } - IN6_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); - if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); if (is_final) { /* Remove the gap in the membership array. */ KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); for (++idx; idx < imo->im6o_num_memberships; ++idx) { imo->im6o_membership[idx - 1] = imo->im6o_membership[idx]; imo->im6o_mfilters[idx - 1] = imo->im6o_mfilters[idx]; } im6f_init(&imo->im6o_mfilters[idx - 1], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->im6o_num_memberships--; } out_in6p_locked: INP_WUNLOCK(inp); + IN6_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv6 multicast datagrams. * * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn * may be passed to this socket option. An address of in6addr_any or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct ifnet *ifp; struct ip6_moptions *imo; u_int ifindex; int error; if (sopt->sopt_valsize != sizeof(u_int)) return (EINVAL); error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int)); if (error) return (error); if (V_if_index < ifindex) return (EINVAL); if (ifindex == 0) ifp = NULL; else { ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EINVAL); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); } imo = in6p_findmoptions(inp); imo->im6o_multicast_ifp = ifp; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv6 multicast group. - * - * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) return (ENOBUFS); if (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); gsa->sin6.sin6_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->im6f_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_MLD, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as im6f_leave() * will set it to INCLUDE. */ im6f_leave(imf); imf->im6f_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in6 *)pkss; if (psin->sin6_family != AF_INET6) { error = EAFNOSUPPORT; break; } if (psin->sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; break; } if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) { error = EINVAL; break; } /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&psin->sin6_addr); error = im6f_get_source(imf, psin, &lims); if (error) break; lims->im6sl_st[1] = imf->im6f_st[1]; } free(kss, M_TEMP); } if (error) goto out_im6f_rollback; INP_WLOCK_ASSERT(inp); IN6_MULTI_LIST_LOCK(); /* * Begin state merge transaction at MLD layer. */ CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv6 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. */ int ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: error = in6p_set_multicast_if(inp, sopt); break; case IPV6_MULTICAST_HOPS: { int hlim; if (sopt->sopt_valsize != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int)); if (error) break; if (hlim < -1 || hlim > 255) { error = EINVAL; break; } else if (hlim == -1) { hlim = V_ip6_defmcasthlim; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_hlim = hlim; INP_WUNLOCK(inp); break; } case IPV6_MULTICAST_LOOP: { u_int loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (sopt->sopt_valsize != sizeof(u_int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int)); if (error) break; if (loop > 1) { error = EINVAL; break; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_loop = loop; INP_WUNLOCK(inp); break; } case IPV6_JOIN_GROUP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = in6p_join_group(inp, sopt); break; case IPV6_LEAVE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = in6p_leave_group(inp, sopt); break; case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = in6p_block_unblock_source(inp, sopt); break; case IPV6_MSFILTER: error = in6p_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose MLD's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in6_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in6_addr mcaddr; struct in6_addr src; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in6_multi *inm; struct ip6_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); /* int: ifindex + 4 * 32 bits of IPv6 address */ if (namelen != 5) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_MLD, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { CTR2(KTR_MLD, "%s: group %s is not multicast", __func__, ip6_sprintf(ip6tbuf, &mcaddr)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_MLD, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } /* * Internal MLD lookups require that scope/zone ID is set. */ (void)in6_setscope(&mcaddr, ifp, NULL); retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr))); if (retval) return (retval); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) continue; fmode = inm->in6m_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { CTR2(KTR_MLD, "%s: visit node %p", __func__, ims); /* * Only copy-out sources which are in-mode. */ if (fmode != im6s_get_mode(inm, ims, 1)) { CTR1(KTR_MLD, "%s: skip non-in-mode", __func__); continue; } src = ims->im6s_addr; retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); if (retval != 0) break; } } NET_EPOCH_EXIT(et); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); return (retval); } #ifdef KTR static const char *in6m_modestrs[] = { "un", "in", "ex" }; static const char * in6m_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (in6m_modestrs[mode]); return ("??"); } static const char *in6m_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * in6m_state_str(const int state) { if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER) return (in6m_statestrs[state]); return ("??"); } /* * Dump an in6_multi structure to the console. */ void in6m_print(const struct in6_multi *inm) { int t; char ip6tbuf[INET6_ADDRSTRLEN]; if ((ktr_mask & KTR_MLD) == 0) return; printf("%s: --- begin in6m %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp), inm->in6m_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->in6m_timer, in6m_state_str(inm->in6m_state), inm->in6m_refcount, mbufq_len(&inm->in6m_scq)); printf("mli %p nsrc %lu sctimer %u scrv %u\n", inm->in6m_mli, inm->in6m_nsrc, inm->in6m_sctimer, inm->in6m_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, in6m_mode_str(inm->in6m_st[t].iss_fmode), inm->in6m_st[t].iss_asm, inm->in6m_st[t].iss_ex, inm->in6m_st[t].iss_in, inm->in6m_st[t].iss_rec); } printf("%s: --- end in6m %p ---\n", __func__, inm); } #else /* !KTR */ void in6m_print(const struct in6_multi *inm) { } #endif /* KTR */ Index: projects/runtime-coverage-v2/sys/sys/param.h =================================================================== --- projects/runtime-coverage-v2/sys/sys/param.h (revision 347598) +++ projects/runtime-coverage-v2/sys/sys/param.h (revision 347599) @@ -1,367 +1,367 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1300025 /* Master, propagated to newvers */ +#define __FreeBSD_version 1300026 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_CK_SUPERBLOCK 1300000 #define P_OSREL_CK_INODE 1300005 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 255 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: projects/runtime-coverage-v2/sys/x86/include/specialreg.h =================================================================== --- projects/runtime-coverage-v2/sys/x86/include/specialreg.h (revision 347598) +++ projects/runtime-coverage-v2/sys/x86/include/specialreg.h (revision 347599) @@ -1,1118 +1,1120 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 * $FreeBSD$ */ #ifndef _MACHINE_SPECIALREG_H_ #define _MACHINE_SPECIALREG_H_ /* * Bits in 386 special registers: */ #define CR0_PE 0x00000001 /* Protected mode Enable */ #define CR0_MP 0x00000002 /* "Math" (fpu) Present */ #define CR0_EM 0x00000004 /* EMulate FPU instructions. (trap ESC only) */ #define CR0_TS 0x00000008 /* Task Switched (if MP, trap ESC and WAIT) */ #define CR0_PG 0x80000000 /* PaGing enable */ /* * Bits in 486 special registers: */ #define CR0_NE 0x00000020 /* Numeric Error enable (EX16 vs IRQ13) */ #define CR0_WP 0x00010000 /* Write Protect (honor page protect in all modes) */ #define CR0_AM 0x00040000 /* Alignment Mask (set to enable AC flag) */ #define CR0_NW 0x20000000 /* Not Write-through */ #define CR0_CD 0x40000000 /* Cache Disable */ #define CR3_PCID_SAVE 0x8000000000000000 #define CR3_PCID_MASK 0xfff /* * Bits in PPro special registers */ #define CR4_VME 0x00000001 /* Virtual 8086 mode extensions */ #define CR4_PVI 0x00000002 /* Protected-mode virtual interrupts */ #define CR4_TSD 0x00000004 /* Time stamp disable */ #define CR4_DE 0x00000008 /* Debugging extensions */ #define CR4_PSE 0x00000010 /* Page size extensions */ #define CR4_PAE 0x00000020 /* Physical address extension */ #define CR4_MCE 0x00000040 /* Machine check enable */ #define CR4_PGE 0x00000080 /* Page global enable */ #define CR4_PCE 0x00000100 /* Performance monitoring counter enable */ #define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */ #define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */ #define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */ #define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */ #define CR4_PCIDE 0x00020000 /* Enable Context ID */ #define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ #define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ #define CR4_SMAP 0x00200000 /* Supervisor-Mode Access Prevention */ #define CR4_PKE 0x00400000 /* Protection Keys Enable */ /* * Bits in AMD64 special registers. EFER is 64 bits wide. */ #define EFER_SCE 0x000000001 /* System Call Extensions (R/W) */ #define EFER_LME 0x000000100 /* Long mode enable (R/W) */ #define EFER_LMA 0x000000400 /* Long mode active (R) */ #define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */ #define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */ #define EFER_LMSLE 0x000002000 /* Long Mode Segment Limit Enable */ #define EFER_FFXSR 0x000004000 /* Fast FXSAVE/FSRSTOR */ #define EFER_TCE 0x000008000 /* Translation Cache Extension */ /* * Intel Extended Features registers */ #define XCR0 0 /* XFEATURE_ENABLED_MASK register */ #define XFEATURE_ENABLED_X87 0x00000001 #define XFEATURE_ENABLED_SSE 0x00000002 #define XFEATURE_ENABLED_YMM_HI128 0x00000004 #define XFEATURE_ENABLED_AVX XFEATURE_ENABLED_YMM_HI128 #define XFEATURE_ENABLED_BNDREGS 0x00000008 #define XFEATURE_ENABLED_BNDCSR 0x00000010 #define XFEATURE_ENABLED_OPMASK 0x00000020 #define XFEATURE_ENABLED_ZMM_HI256 0x00000040 #define XFEATURE_ENABLED_HI16_ZMM 0x00000080 #define XFEATURE_AVX \ (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX) #define XFEATURE_AVX512 \ (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 | \ XFEATURE_ENABLED_HI16_ZMM) #define XFEATURE_MPX \ (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR) /* * CPUID instruction features register */ #define CPUID_FPU 0x00000001 #define CPUID_VME 0x00000002 #define CPUID_DE 0x00000004 #define CPUID_PSE 0x00000008 #define CPUID_TSC 0x00000010 #define CPUID_MSR 0x00000020 #define CPUID_PAE 0x00000040 #define CPUID_MCE 0x00000080 #define CPUID_CX8 0x00000100 #define CPUID_APIC 0x00000200 #define CPUID_B10 0x00000400 #define CPUID_SEP 0x00000800 #define CPUID_MTRR 0x00001000 #define CPUID_PGE 0x00002000 #define CPUID_MCA 0x00004000 #define CPUID_CMOV 0x00008000 #define CPUID_PAT 0x00010000 #define CPUID_PSE36 0x00020000 #define CPUID_PSN 0x00040000 #define CPUID_CLFSH 0x00080000 #define CPUID_B20 0x00100000 #define CPUID_DS 0x00200000 #define CPUID_ACPI 0x00400000 #define CPUID_MMX 0x00800000 #define CPUID_FXSR 0x01000000 #define CPUID_SSE 0x02000000 #define CPUID_XMM 0x02000000 #define CPUID_SSE2 0x04000000 #define CPUID_SS 0x08000000 #define CPUID_HTT 0x10000000 #define CPUID_TM 0x20000000 #define CPUID_IA64 0x40000000 #define CPUID_PBE 0x80000000 #define CPUID2_SSE3 0x00000001 #define CPUID2_PCLMULQDQ 0x00000002 #define CPUID2_DTES64 0x00000004 #define CPUID2_MON 0x00000008 #define CPUID2_DS_CPL 0x00000010 #define CPUID2_VMX 0x00000020 #define CPUID2_SMX 0x00000040 #define CPUID2_EST 0x00000080 #define CPUID2_TM2 0x00000100 #define CPUID2_SSSE3 0x00000200 #define CPUID2_CNXTID 0x00000400 #define CPUID2_SDBG 0x00000800 #define CPUID2_FMA 0x00001000 #define CPUID2_CX16 0x00002000 #define CPUID2_XTPR 0x00004000 #define CPUID2_PDCM 0x00008000 #define CPUID2_PCID 0x00020000 #define CPUID2_DCA 0x00040000 #define CPUID2_SSE41 0x00080000 #define CPUID2_SSE42 0x00100000 #define CPUID2_X2APIC 0x00200000 #define CPUID2_MOVBE 0x00400000 #define CPUID2_POPCNT 0x00800000 #define CPUID2_TSCDLT 0x01000000 #define CPUID2_AESNI 0x02000000 #define CPUID2_XSAVE 0x04000000 #define CPUID2_OSXSAVE 0x08000000 #define CPUID2_AVX 0x10000000 #define CPUID2_F16C 0x20000000 #define CPUID2_RDRAND 0x40000000 #define CPUID2_HV 0x80000000 /* * Important bits in the Thermal and Power Management flags * CPUID.6 EAX and ECX. */ #define CPUTPM1_SENSOR 0x00000001 #define CPUTPM1_TURBO 0x00000002 #define CPUTPM1_ARAT 0x00000004 #define CPUTPM1_HWP 0x00000080 #define CPUTPM1_HWP_NOTIFICATION 0x00000100 #define CPUTPM1_HWP_ACTIVITY_WINDOW 0x00000200 #define CPUTPM1_HWP_PERF_PREF 0x00000400 #define CPUTPM1_HWP_PKG 0x00000800 #define CPUTPM1_HWP_FLEXIBLE 0x00020000 #define CPUTPM2_EFFREQ 0x00000001 /* Intel Processor Trace CPUID. */ /* Leaf 0 ebx. */ #define CPUPT_CR3 (1 << 0) /* CR3 Filtering Support */ #define CPUPT_PSB (1 << 1) /* Configurable PSB and Cycle-Accurate Mode Supported */ #define CPUPT_IPF (1 << 2) /* IP Filtering and TraceStop supported */ #define CPUPT_MTC (1 << 3) /* MTC Supported */ #define CPUPT_PRW (1 << 4) /* PTWRITE Supported */ #define CPUPT_PWR (1 << 5) /* Power Event Trace Supported */ /* Leaf 0 ecx. */ #define CPUPT_TOPA (1 << 0) /* ToPA Output Supported */ #define CPUPT_TOPA_MULTI (1 << 1) /* ToPA Tables Allow Multiple Output Entries */ #define CPUPT_SINGLE (1 << 2) /* Single-Range Output Supported */ #define CPUPT_TT_OUT (1 << 3) /* Output to Trace Transport Subsystem Supported */ #define CPUPT_LINEAR_IP (1 << 31) /* IP Payloads are Linear IP, otherwise IP is effective */ /* Leaf 1 eax. */ #define CPUPT_NADDR_S 0 /* Number of Address Ranges */ #define CPUPT_NADDR_M (0x7 << CPUPT_NADDR_S) #define CPUPT_MTC_BITMAP_S 16 /* Bitmap of supported MTC Period Encodings */ #define CPUPT_MTC_BITMAP_M (0xffff << CPUPT_MTC_BITMAP_S) /* Leaf 1 ebx. */ #define CPUPT_CT_BITMAP_S 0 /* Bitmap of supported Cycle Threshold values */ #define CPUPT_CT_BITMAP_M (0xffff << CPUPT_CT_BITMAP_S) #define CPUPT_PFE_BITMAP_S 16 /* Bitmap of supported Configurable PSB Frequency encoding */ #define CPUPT_PFE_BITMAP_M (0xffff << CPUPT_PFE_BITMAP_S) /* * Important bits in the AMD extended cpuid flags */ #define AMDID_SYSCALL 0x00000800 #define AMDID_MP 0x00080000 #define AMDID_NX 0x00100000 #define AMDID_EXT_MMX 0x00400000 #define AMDID_FFXSR 0x02000000 #define AMDID_PAGE1GB 0x04000000 #define AMDID_RDTSCP 0x08000000 #define AMDID_LM 0x20000000 #define AMDID_EXT_3DNOW 0x40000000 #define AMDID_3DNOW 0x80000000 #define AMDID2_LAHF 0x00000001 #define AMDID2_CMP 0x00000002 #define AMDID2_SVM 0x00000004 #define AMDID2_EXT_APIC 0x00000008 #define AMDID2_CR8 0x00000010 #define AMDID2_ABM 0x00000020 #define AMDID2_SSE4A 0x00000040 #define AMDID2_MAS 0x00000080 #define AMDID2_PREFETCH 0x00000100 #define AMDID2_OSVW 0x00000200 #define AMDID2_IBS 0x00000400 #define AMDID2_XOP 0x00000800 #define AMDID2_SKINIT 0x00001000 #define AMDID2_WDT 0x00002000 #define AMDID2_LWP 0x00008000 #define AMDID2_FMA4 0x00010000 #define AMDID2_TCE 0x00020000 #define AMDID2_NODE_ID 0x00080000 #define AMDID2_TBM 0x00200000 #define AMDID2_TOPOLOGY 0x00400000 #define AMDID2_PCXC 0x00800000 #define AMDID2_PNXC 0x01000000 #define AMDID2_DBE 0x04000000 #define AMDID2_PTSC 0x08000000 #define AMDID2_PTSCEL2I 0x10000000 #define AMDID2_MWAITX 0x20000000 /* * CPUID instruction 1 eax info */ #define CPUID_STEPPING 0x0000000f #define CPUID_MODEL 0x000000f0 #define CPUID_FAMILY 0x00000f00 #define CPUID_EXT_MODEL 0x000f0000 #define CPUID_EXT_FAMILY 0x0ff00000 #ifdef __i386__ #define CPUID_TO_MODEL(id) \ ((((id) & CPUID_MODEL) >> 4) | \ ((((id) & CPUID_FAMILY) >= 0x600) ? \ (((id) & CPUID_EXT_MODEL) >> 12) : 0)) #define CPUID_TO_FAMILY(id) \ ((((id) & CPUID_FAMILY) >> 8) + \ ((((id) & CPUID_FAMILY) == 0xf00) ? \ (((id) & CPUID_EXT_FAMILY) >> 20) : 0)) #else #define CPUID_TO_MODEL(id) \ ((((id) & CPUID_MODEL) >> 4) | \ (((id) & CPUID_EXT_MODEL) >> 12)) #define CPUID_TO_FAMILY(id) \ ((((id) & CPUID_FAMILY) >> 8) + \ (((id) & CPUID_EXT_FAMILY) >> 20)) #endif /* * CPUID instruction 1 ebx info */ #define CPUID_BRAND_INDEX 0x000000ff #define CPUID_CLFUSH_SIZE 0x0000ff00 #define CPUID_HTT_CORES 0x00ff0000 #define CPUID_LOCAL_APIC_ID 0xff000000 /* * CPUID instruction 5 info */ #define CPUID5_MON_MIN_SIZE 0x0000ffff /* eax */ #define CPUID5_MON_MAX_SIZE 0x0000ffff /* ebx */ #define CPUID5_MON_MWAIT_EXT 0x00000001 /* ecx */ #define CPUID5_MWAIT_INTRBREAK 0x00000002 /* ecx */ /* * MWAIT cpu power states. Lower 4 bits are sub-states. */ #define MWAIT_C0 0xf0 #define MWAIT_C1 0x00 #define MWAIT_C2 0x10 #define MWAIT_C3 0x20 #define MWAIT_C4 0x30 /* * MWAIT extensions. */ /* Interrupt breaks MWAIT even when masked. */ #define MWAIT_INTRBREAK 0x00000001 /* * CPUID instruction 6 ecx info */ #define CPUID_PERF_STAT 0x00000001 #define CPUID_PERF_BIAS 0x00000008 /* * CPUID instruction 0xb ebx info. */ #define CPUID_TYPE_INVAL 0 #define CPUID_TYPE_SMT 1 #define CPUID_TYPE_CORE 2 /* * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1 */ #define CPUID_EXTSTATE_XSAVEOPT 0x00000001 #define CPUID_EXTSTATE_XSAVEC 0x00000002 #define CPUID_EXTSTATE_XINUSE 0x00000004 #define CPUID_EXTSTATE_XSAVES 0x00000008 /* * AMD extended function 8000_0007h ebx info */ #define AMDRAS_MCA_OF_RECOV 0x00000001 #define AMDRAS_SUCCOR 0x00000002 #define AMDRAS_HW_ASSERT 0x00000004 #define AMDRAS_SCALABLE_MCA 0x00000008 #define AMDRAS_PFEH_SUPPORT 0x00000010 /* * AMD extended function 8000_0007h edx info */ #define AMDPM_TS 0x00000001 #define AMDPM_FID 0x00000002 #define AMDPM_VID 0x00000004 #define AMDPM_TTP 0x00000008 #define AMDPM_TM 0x00000010 #define AMDPM_STC 0x00000020 #define AMDPM_100MHZ_STEPS 0x00000040 #define AMDPM_HW_PSTATE 0x00000080 #define AMDPM_TSC_INVARIANT 0x00000100 #define AMDPM_CPB 0x00000200 /* * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions) */ #define AMDFEID_CLZERO 0x00000001 #define AMDFEID_IRPERF 0x00000002 #define AMDFEID_XSAVEERPTR 0x00000004 #define AMDFEID_IBPB 0x00001000 #define AMDFEID_IBRS 0x00004000 #define AMDFEID_STIBP 0x00008000 /* The below are only defined if the corresponding base feature above exists. */ #define AMDFEID_IBRS_ALWAYSON 0x00010000 #define AMDFEID_STIBP_ALWAYSON 0x00020000 #define AMDFEID_PREFER_IBRS 0x00040000 #define AMDFEID_SSBD 0x01000000 /* SSBD via MSRC001_011F instead of MSR 0x48: */ #define AMDFEID_VIRT_SSBD 0x02000000 #define AMDFEID_SSB_NO 0x04000000 /* * AMD extended function 8000_0008h ecx info */ #define AMDID_CMP_CORES 0x000000ff #define AMDID_COREID_SIZE 0x0000f000 #define AMDID_COREID_SIZE_SHIFT 12 /* * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info */ #define CPUID_STDEXT_FSGSBASE 0x00000001 #define CPUID_STDEXT_TSC_ADJUST 0x00000002 #define CPUID_STDEXT_SGX 0x00000004 #define CPUID_STDEXT_BMI1 0x00000008 #define CPUID_STDEXT_HLE 0x00000010 #define CPUID_STDEXT_AVX2 0x00000020 #define CPUID_STDEXT_FDP_EXC 0x00000040 #define CPUID_STDEXT_SMEP 0x00000080 #define CPUID_STDEXT_BMI2 0x00000100 #define CPUID_STDEXT_ERMS 0x00000200 #define CPUID_STDEXT_INVPCID 0x00000400 #define CPUID_STDEXT_RTM 0x00000800 #define CPUID_STDEXT_PQM 0x00001000 #define CPUID_STDEXT_NFPUSG 0x00002000 #define CPUID_STDEXT_MPX 0x00004000 #define CPUID_STDEXT_PQE 0x00008000 #define CPUID_STDEXT_AVX512F 0x00010000 #define CPUID_STDEXT_AVX512DQ 0x00020000 #define CPUID_STDEXT_RDSEED 0x00040000 #define CPUID_STDEXT_ADX 0x00080000 #define CPUID_STDEXT_SMAP 0x00100000 #define CPUID_STDEXT_AVX512IFMA 0x00200000 #define CPUID_STDEXT_PCOMMIT 0x00400000 #define CPUID_STDEXT_CLFLUSHOPT 0x00800000 #define CPUID_STDEXT_CLWB 0x01000000 #define CPUID_STDEXT_PROCTRACE 0x02000000 #define CPUID_STDEXT_AVX512PF 0x04000000 #define CPUID_STDEXT_AVX512ER 0x08000000 #define CPUID_STDEXT_AVX512CD 0x10000000 #define CPUID_STDEXT_SHA 0x20000000 #define CPUID_STDEXT_AVX512BW 0x40000000 #define CPUID_STDEXT_AVX512VL 0x80000000 /* * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info */ #define CPUID_STDEXT2_PREFETCHWT1 0x00000001 #define CPUID_STDEXT2_UMIP 0x00000004 #define CPUID_STDEXT2_PKU 0x00000008 #define CPUID_STDEXT2_OSPKE 0x00000010 #define CPUID_STDEXT2_WAITPKG 0x00000020 #define CPUID_STDEXT2_GFNI 0x00000100 #define CPUID_STDEXT2_RDPID 0x00400000 #define CPUID_STDEXT2_CLDEMOTE 0x02000000 #define CPUID_STDEXT2_MOVDIRI 0x08000000 #define CPUID_STDEXT2_MOVDIRI64B 0x10000000 #define CPUID_STDEXT2_SGXLC 0x40000000 /* * CPUID instruction 7 Structured Extended Features, leaf 0 edx info */ +#define CPUID_STDEXT3_MD_CLEAR 0x00000400 #define CPUID_STDEXT3_TSXFA 0x00002000 #define CPUID_STDEXT3_IBPB 0x04000000 #define CPUID_STDEXT3_STIBP 0x08000000 #define CPUID_STDEXT3_L1D_FLUSH 0x10000000 #define CPUID_STDEXT3_ARCH_CAP 0x20000000 #define CPUID_STDEXT3_CORE_CAP 0x40000000 #define CPUID_STDEXT3_SSBD 0x80000000 /* MSR IA32_ARCH_CAP(ABILITIES) bits */ #define IA32_ARCH_CAP_RDCL_NO 0x00000001 #define IA32_ARCH_CAP_IBRS_ALL 0x00000002 #define IA32_ARCH_CAP_RSBA 0x00000004 #define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x00000008 #define IA32_ARCH_CAP_SSB_NO 0x00000010 +#define IA32_ARCH_CAP_MDS_NO 0x00000020 /* * CPUID manufacturers identifiers */ #define AMD_VENDOR_ID "AuthenticAMD" #define CENTAUR_VENDOR_ID "CentaurHauls" #define CYRIX_VENDOR_ID "CyrixInstead" #define INTEL_VENDOR_ID "GenuineIntel" #define NEXGEN_VENDOR_ID "NexGenDriven" #define NSC_VENDOR_ID "Geode by NSC" #define RISE_VENDOR_ID "RiseRiseRise" #define SIS_VENDOR_ID "SiS SiS SiS " #define TRANSMETA_VENDOR_ID "GenuineTMx86" #define UMC_VENDOR_ID "UMC UMC UMC " /* * Model-specific registers for the i386 family */ #define MSR_P5_MC_ADDR 0x000 #define MSR_P5_MC_TYPE 0x001 #define MSR_TSC 0x010 #define MSR_P5_CESR 0x011 #define MSR_P5_CTR0 0x012 #define MSR_P5_CTR1 0x013 #define MSR_IA32_PLATFORM_ID 0x017 #define MSR_APICBASE 0x01b #define MSR_EBL_CR_POWERON 0x02a #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a #define MSR_IA32_SPEC_CTRL 0x048 #define MSR_IA32_PRED_CMD 0x049 #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 #define MSR_BBL_CR_D1 0x089 #define MSR_BBL_CR_D2 0x08a #define MSR_BIOS_SIGN 0x08b #define MSR_PERFCTR0 0x0c1 #define MSR_PERFCTR1 0x0c2 #define MSR_PLATFORM_INFO 0x0ce #define MSR_MPERF 0x0e7 #define MSR_APERF 0x0e8 #define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ #define MSR_MTRRcap 0x0fe #define MSR_IA32_ARCH_CAP 0x10a #define MSR_IA32_FLUSH_CMD 0x10b #define MSR_TSX_FORCE_ABORT 0x10f #define MSR_BBL_CR_ADDR 0x116 #define MSR_BBL_CR_DECC 0x118 #define MSR_BBL_CR_CTL 0x119 #define MSR_BBL_CR_TRIG 0x11a #define MSR_BBL_CR_BUSY 0x11b #define MSR_BBL_CR_CTL3 0x11e #define MSR_SYSENTER_CS_MSR 0x174 #define MSR_SYSENTER_ESP_MSR 0x175 #define MSR_SYSENTER_EIP_MSR 0x176 #define MSR_MCG_CAP 0x179 #define MSR_MCG_STATUS 0x17a #define MSR_MCG_CTL 0x17b #define MSR_EVNTSEL0 0x186 #define MSR_EVNTSEL1 0x187 #define MSR_THERM_CONTROL 0x19a #define MSR_THERM_INTERRUPT 0x19b #define MSR_THERM_STATUS 0x19c #define MSR_IA32_MISC_ENABLE 0x1a0 #define MSR_IA32_TEMPERATURE_TARGET 0x1a2 #define MSR_TURBO_RATIO_LIMIT 0x1ad #define MSR_TURBO_RATIO_LIMIT1 0x1ae #define MSR_DEBUGCTLMSR 0x1d9 #define MSR_LASTBRANCHFROMIP 0x1db #define MSR_LASTBRANCHTOIP 0x1dc #define MSR_LASTINTFROMIP 0x1dd #define MSR_LASTINTTOIP 0x1de #define MSR_ROB_CR_BKUPTMPDR6 0x1e0 #define MSR_MTRRVarBase 0x200 #define MSR_MTRR64kBase 0x250 #define MSR_MTRR16kBase 0x258 #define MSR_MTRR4kBase 0x268 #define MSR_PAT 0x277 #define MSR_MC0_CTL2 0x280 #define MSR_MTRRdefType 0x2ff #define MSR_MC0_CTL 0x400 #define MSR_MC0_STATUS 0x401 #define MSR_MC0_ADDR 0x402 #define MSR_MC0_MISC 0x403 #define MSR_MC1_CTL 0x404 #define MSR_MC1_STATUS 0x405 #define MSR_MC1_ADDR 0x406 #define MSR_MC1_MISC 0x407 #define MSR_MC2_CTL 0x408 #define MSR_MC2_STATUS 0x409 #define MSR_MC2_ADDR 0x40a #define MSR_MC2_MISC 0x40b #define MSR_MC3_CTL 0x40c #define MSR_MC3_STATUS 0x40d #define MSR_MC3_ADDR 0x40e #define MSR_MC3_MISC 0x40f #define MSR_MC4_CTL 0x410 #define MSR_MC4_STATUS 0x411 #define MSR_MC4_ADDR 0x412 #define MSR_MC4_MISC 0x413 #define MSR_RAPL_POWER_UNIT 0x606 #define MSR_PKG_ENERGY_STATUS 0x611 #define MSR_DRAM_ENERGY_STATUS 0x619 #define MSR_PP0_ENERGY_STATUS 0x639 #define MSR_PP1_ENERGY_STATUS 0x641 #define MSR_PPERF 0x64e #define MSR_TSC_DEADLINE 0x6e0 /* Writes are not serializing */ #define MSR_IA32_PM_ENABLE 0x770 #define MSR_IA32_HWP_CAPABILITIES 0x771 #define MSR_IA32_HWP_REQUEST_PKG 0x772 #define MSR_IA32_HWP_INTERRUPT 0x773 #define MSR_IA32_HWP_REQUEST 0x774 #define MSR_IA32_HWP_STATUS 0x777 /* * VMX MSRs */ #define MSR_VMX_BASIC 0x480 #define MSR_VMX_PINBASED_CTLS 0x481 #define MSR_VMX_PROCBASED_CTLS 0x482 #define MSR_VMX_EXIT_CTLS 0x483 #define MSR_VMX_ENTRY_CTLS 0x484 #define MSR_VMX_CR0_FIXED0 0x486 #define MSR_VMX_CR0_FIXED1 0x487 #define MSR_VMX_CR4_FIXED0 0x488 #define MSR_VMX_CR4_FIXED1 0x489 #define MSR_VMX_PROCBASED_CTLS2 0x48b #define MSR_VMX_EPT_VPID_CAP 0x48c #define MSR_VMX_TRUE_PINBASED_CTLS 0x48d #define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e #define MSR_VMX_TRUE_EXIT_CTLS 0x48f #define MSR_VMX_TRUE_ENTRY_CTLS 0x490 /* * X2APIC MSRs. * Writes are not serializing. */ #define MSR_APIC_000 0x800 #define MSR_APIC_ID 0x802 #define MSR_APIC_VERSION 0x803 #define MSR_APIC_TPR 0x808 #define MSR_APIC_EOI 0x80b #define MSR_APIC_LDR 0x80d #define MSR_APIC_SVR 0x80f #define MSR_APIC_ISR0 0x810 #define MSR_APIC_ISR1 0x811 #define MSR_APIC_ISR2 0x812 #define MSR_APIC_ISR3 0x813 #define MSR_APIC_ISR4 0x814 #define MSR_APIC_ISR5 0x815 #define MSR_APIC_ISR6 0x816 #define MSR_APIC_ISR7 0x817 #define MSR_APIC_TMR0 0x818 #define MSR_APIC_IRR0 0x820 #define MSR_APIC_ESR 0x828 #define MSR_APIC_LVT_CMCI 0x82F #define MSR_APIC_ICR 0x830 #define MSR_APIC_LVT_TIMER 0x832 #define MSR_APIC_LVT_THERMAL 0x833 #define MSR_APIC_LVT_PCINT 0x834 #define MSR_APIC_LVT_LINT0 0x835 #define MSR_APIC_LVT_LINT1 0x836 #define MSR_APIC_LVT_ERROR 0x837 #define MSR_APIC_ICR_TIMER 0x838 #define MSR_APIC_CCR_TIMER 0x839 #define MSR_APIC_DCR_TIMER 0x83e #define MSR_APIC_SELF_IPI 0x83f #define MSR_IA32_XSS 0xda0 /* * Intel Processor Trace (PT) MSRs. */ #define MSR_IA32_RTIT_OUTPUT_BASE 0x560 /* Trace Output Base Register (R/W) */ #define MSR_IA32_RTIT_OUTPUT_MASK_PTRS 0x561 /* Trace Output Mask Pointers Register (R/W) */ #define MSR_IA32_RTIT_CTL 0x570 /* Trace Control Register (R/W) */ #define RTIT_CTL_TRACEEN (1 << 0) #define RTIT_CTL_CYCEN (1 << 1) #define RTIT_CTL_OS (1 << 2) #define RTIT_CTL_USER (1 << 3) #define RTIT_CTL_PWREVTEN (1 << 4) #define RTIT_CTL_FUPONPTW (1 << 5) #define RTIT_CTL_FABRICEN (1 << 6) #define RTIT_CTL_CR3FILTER (1 << 7) #define RTIT_CTL_TOPA (1 << 8) #define RTIT_CTL_MTCEN (1 << 9) #define RTIT_CTL_TSCEN (1 << 10) #define RTIT_CTL_DISRETC (1 << 11) #define RTIT_CTL_PTWEN (1 << 12) #define RTIT_CTL_BRANCHEN (1 << 13) #define RTIT_CTL_MTC_FREQ_S 14 #define RTIT_CTL_MTC_FREQ(n) ((n) << RTIT_CTL_MTC_FREQ_S) #define RTIT_CTL_MTC_FREQ_M (0xf << RTIT_CTL_MTC_FREQ_S) #define RTIT_CTL_CYC_THRESH_S 19 #define RTIT_CTL_CYC_THRESH_M (0xf << RTIT_CTL_CYC_THRESH_S) #define RTIT_CTL_PSB_FREQ_S 24 #define RTIT_CTL_PSB_FREQ_M (0xf << RTIT_CTL_PSB_FREQ_S) #define RTIT_CTL_ADDR_CFG_S(n) (32 + (n) * 4) #define RTIT_CTL_ADDR0_CFG_S 32 #define RTIT_CTL_ADDR0_CFG_M (0xfULL << RTIT_CTL_ADDR0_CFG_S) #define RTIT_CTL_ADDR1_CFG_S 36 #define RTIT_CTL_ADDR1_CFG_M (0xfULL << RTIT_CTL_ADDR1_CFG_S) #define RTIT_CTL_ADDR2_CFG_S 40 #define RTIT_CTL_ADDR2_CFG_M (0xfULL << RTIT_CTL_ADDR2_CFG_S) #define RTIT_CTL_ADDR3_CFG_S 44 #define RTIT_CTL_ADDR3_CFG_M (0xfULL << RTIT_CTL_ADDR3_CFG_S) #define MSR_IA32_RTIT_STATUS 0x571 /* Tracing Status Register (R/W) */ #define RTIT_STATUS_FILTEREN (1 << 0) #define RTIT_STATUS_CONTEXTEN (1 << 1) #define RTIT_STATUS_TRIGGEREN (1 << 2) #define RTIT_STATUS_ERROR (1 << 4) #define RTIT_STATUS_STOPPED (1 << 5) #define RTIT_STATUS_PACKETBYTECNT_S 32 #define RTIT_STATUS_PACKETBYTECNT_M (0x1ffffULL << RTIT_STATUS_PACKETBYTECNT_S) #define MSR_IA32_RTIT_CR3_MATCH 0x572 /* Trace Filter CR3 Match Register (R/W) */ #define MSR_IA32_RTIT_ADDR_A(n) (0x580 + (n) * 2) #define MSR_IA32_RTIT_ADDR_B(n) (0x581 + (n) * 2) #define MSR_IA32_RTIT_ADDR0_A 0x580 /* Region 0 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR0_B 0x581 /* Region 0 End Address (R/W) */ #define MSR_IA32_RTIT_ADDR1_A 0x582 /* Region 1 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR1_B 0x583 /* Region 1 End Address (R/W) */ #define MSR_IA32_RTIT_ADDR2_A 0x584 /* Region 2 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR2_B 0x585 /* Region 2 End Address (R/W) */ #define MSR_IA32_RTIT_ADDR3_A 0x586 /* Region 3 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR3_B 0x587 /* Region 3 End Address (R/W) */ /* Intel Processor Trace Table of Physical Addresses (ToPA). */ #define TOPA_SIZE_S 6 #define TOPA_SIZE_M (0xf << TOPA_SIZE_S) #define TOPA_SIZE_4K (0 << TOPA_SIZE_S) #define TOPA_SIZE_8K (1 << TOPA_SIZE_S) #define TOPA_SIZE_16K (2 << TOPA_SIZE_S) #define TOPA_SIZE_32K (3 << TOPA_SIZE_S) #define TOPA_SIZE_64K (4 << TOPA_SIZE_S) #define TOPA_SIZE_128K (5 << TOPA_SIZE_S) #define TOPA_SIZE_256K (6 << TOPA_SIZE_S) #define TOPA_SIZE_512K (7 << TOPA_SIZE_S) #define TOPA_SIZE_1M (8 << TOPA_SIZE_S) #define TOPA_SIZE_2M (9 << TOPA_SIZE_S) #define TOPA_SIZE_4M (10 << TOPA_SIZE_S) #define TOPA_SIZE_8M (11 << TOPA_SIZE_S) #define TOPA_SIZE_16M (12 << TOPA_SIZE_S) #define TOPA_SIZE_32M (13 << TOPA_SIZE_S) #define TOPA_SIZE_64M (14 << TOPA_SIZE_S) #define TOPA_SIZE_128M (15 << TOPA_SIZE_S) #define TOPA_STOP (1 << 4) #define TOPA_INT (1 << 2) #define TOPA_END (1 << 0) /* * Constants related to MSR's. */ #define APICBASE_RESERVED 0x000002ff #define APICBASE_BSP 0x00000100 #define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 /* MSR_IA32_FEATURE_CONTROL related */ #define IA32_FEATURE_CONTROL_LOCK 0x01 /* lock bit */ #define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ #define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ /* MSR IA32_MISC_ENABLE */ #define IA32_MISC_EN_FASTSTR 0x0000000000000001ULL #define IA32_MISC_EN_ATCCE 0x0000000000000008ULL #define IA32_MISC_EN_PERFMON 0x0000000000000080ULL #define IA32_MISC_EN_PEBSU 0x0000000000001000ULL #define IA32_MISC_EN_ESSTE 0x0000000000010000ULL #define IA32_MISC_EN_MONE 0x0000000000040000ULL #define IA32_MISC_EN_LIMCPUID 0x0000000000400000ULL #define IA32_MISC_EN_xTPRD 0x0000000000800000ULL #define IA32_MISC_EN_XDD 0x0000000400000000ULL /* * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel' * document 336996-001 Speculative Execution Side Channel Mitigations. * * AMD uses the same MSRs and bit definitions, as described in 111006-B * "Indirect Branch Control Extension" and 124441 "Speculative Store Bypass * Disable." */ /* MSR IA32_SPEC_CTRL */ #define IA32_SPEC_CTRL_IBRS 0x00000001 #define IA32_SPEC_CTRL_STIBP 0x00000002 #define IA32_SPEC_CTRL_SSBD 0x00000004 /* MSR IA32_PRED_CMD */ #define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL /* MSR IA32_FLUSH_CMD */ #define IA32_FLUSH_CMD_L1D 0x00000001 /* MSR IA32_HWP_CAPABILITIES */ #define IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(x) (((x) >> 0) & 0xff) #define IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(x) (((x) >> 8) & 0xff) #define IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(x) (((x) >> 16) & 0xff) #define IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(x) (((x) >> 24) & 0xff) /* MSR IA32_HWP_REQUEST */ #define IA32_HWP_REQUEST_MINIMUM_VALID (1ULL << 63) #define IA32_HWP_REQUEST_MAXIMUM_VALID (1ULL << 62) #define IA32_HWP_REQUEST_DESIRED_VALID (1ULL << 61) #define IA32_HWP_REQUEST_EPP_VALID (1ULL << 60) #define IA32_HWP_REQUEST_ACTIVITY_WINDOW_VALID (1ULL << 59) #define IA32_HWP_REQUEST_PACKAGE_CONTROL (1ULL << 42) #define IA32_HWP_ACTIVITY_WINDOW (0x3ffULL << 32) #define IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE (0xffULL << 24) #define IA32_HWP_DESIRED_PERFORMANCE (0xffULL << 16) #define IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE (0xffULL << 8) #define IA32_HWP_MINIMUM_PERFORMANCE (0xffULL << 0) /* * PAT modes. */ #define PAT_UNCACHEABLE 0x00 #define PAT_WRITE_COMBINING 0x01 #define PAT_WRITE_THROUGH 0x04 #define PAT_WRITE_PROTECTED 0x05 #define PAT_WRITE_BACK 0x06 #define PAT_UNCACHED 0x07 #define PAT_VALUE(i, m) ((long long)(m) << (8 * (i))) #define PAT_MASK(i) PAT_VALUE(i, 0xff) /* * Constants related to MTRRs */ #define MTRR_UNCACHEABLE 0x00 #define MTRR_WRITE_COMBINING 0x01 #define MTRR_WRITE_THROUGH 0x04 #define MTRR_WRITE_PROTECTED 0x05 #define MTRR_WRITE_BACK 0x06 #define MTRR_N64K 8 /* numbers of fixed-size entries */ #define MTRR_N16K 16 #define MTRR_N4K 64 #define MTRR_CAP_WC 0x0000000000000400 #define MTRR_CAP_FIXED 0x0000000000000100 #define MTRR_CAP_VCNT 0x00000000000000ff #define MTRR_DEF_ENABLE 0x0000000000000800 #define MTRR_DEF_FIXED_ENABLE 0x0000000000000400 #define MTRR_DEF_TYPE 0x00000000000000ff #define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000 #define MTRR_PHYSBASE_TYPE 0x00000000000000ff #define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000 #define MTRR_PHYSMASK_VALID 0x0000000000000800 /* * Cyrix configuration registers, accessible as IO ports. */ #define CCR0 0xc0 /* Configuration control register 0 */ #define CCR0_NC0 0x01 /* First 64K of each 1M memory region is non-cacheable */ #define CCR0_NC1 0x02 /* 640K-1M region is non-cacheable */ #define CCR0_A20M 0x04 /* Enables A20M# input pin */ #define CCR0_KEN 0x08 /* Enables KEN# input pin */ #define CCR0_FLUSH 0x10 /* Enables FLUSH# input pin */ #define CCR0_BARB 0x20 /* Flushes internal cache when entering hold state */ #define CCR0_CO 0x40 /* Cache org: 1=direct mapped, 0=2x set assoc */ #define CCR0_SUSPEND 0x80 /* Enables SUSP# and SUSPA# pins */ #define CCR1 0xc1 /* Configuration control register 1 */ #define CCR1_RPL 0x01 /* Enables RPLSET and RPLVAL# pins */ #define CCR1_SMI 0x02 /* Enables SMM pins */ #define CCR1_SMAC 0x04 /* System management memory access */ #define CCR1_MMAC 0x08 /* Main memory access */ #define CCR1_NO_LOCK 0x10 /* Negate LOCK# */ #define CCR1_SM3 0x80 /* SMM address space address region 3 */ #define CCR2 0xc2 #define CCR2_WB 0x02 /* Enables WB cache interface pins */ #define CCR2_SADS 0x02 /* Slow ADS */ #define CCR2_LOCK_NW 0x04 /* LOCK NW Bit */ #define CCR2_SUSP_HLT 0x08 /* Suspend on HALT */ #define CCR2_WT1 0x10 /* WT region 1 */ #define CCR2_WPR1 0x10 /* Write-protect region 1 */ #define CCR2_BARB 0x20 /* Flushes write-back cache when entering hold state. */ #define CCR2_BWRT 0x40 /* Enables burst write cycles */ #define CCR2_USE_SUSP 0x80 /* Enables suspend pins */ #define CCR3 0xc3 #define CCR3_SMILOCK 0x01 /* SMM register lock */ #define CCR3_NMI 0x02 /* Enables NMI during SMM */ #define CCR3_LINBRST 0x04 /* Linear address burst cycles */ #define CCR3_SMMMODE 0x08 /* SMM Mode */ #define CCR3_MAPEN0 0x10 /* Enables Map0 */ #define CCR3_MAPEN1 0x20 /* Enables Map1 */ #define CCR3_MAPEN2 0x40 /* Enables Map2 */ #define CCR3_MAPEN3 0x80 /* Enables Map3 */ #define CCR4 0xe8 #define CCR4_IOMASK 0x07 #define CCR4_MEM 0x08 /* Enables momory bypassing */ #define CCR4_DTE 0x10 /* Enables directory table entry cache */ #define CCR4_FASTFPE 0x20 /* Fast FPU exception */ #define CCR4_CPUID 0x80 /* Enables CPUID instruction */ #define CCR5 0xe9 #define CCR5_WT_ALLOC 0x01 /* Write-through allocate */ #define CCR5_SLOP 0x02 /* LOOP instruction slowed down */ #define CCR5_LBR1 0x10 /* Local bus region 1 */ #define CCR5_ARREN 0x20 /* Enables ARR region */ #define CCR6 0xea #define CCR7 0xeb /* Performance Control Register (5x86 only). */ #define PCR0 0x20 #define PCR0_RSTK 0x01 /* Enables return stack */ #define PCR0_BTB 0x02 /* Enables branch target buffer */ #define PCR0_LOOP 0x04 /* Enables loop */ #define PCR0_AIS 0x08 /* Enables all instrcutions stalled to serialize pipe. */ #define PCR0_MLR 0x10 /* Enables reordering of misaligned loads */ #define PCR0_BTBRT 0x40 /* Enables BTB test register. */ #define PCR0_LSSER 0x80 /* Disable reorder */ /* Device Identification Registers */ #define DIR0 0xfe #define DIR1 0xff /* * Machine Check register constants. */ #define MCG_CAP_COUNT 0x000000ff #define MCG_CAP_CTL_P 0x00000100 #define MCG_CAP_EXT_P 0x00000200 #define MCG_CAP_CMCI_P 0x00000400 #define MCG_CAP_TES_P 0x00000800 #define MCG_CAP_EXT_CNT 0x00ff0000 #define MCG_CAP_SER_P 0x01000000 #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 #define MCG_CTL_ENABLE 0xffffffffffffffff #define MCG_CTL_DISABLE 0x0000000000000000 #define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4) #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) #define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */ #define MC_STATUS_MCA_ERROR 0x000000000000ffff #define MC_STATUS_MODEL_ERROR 0x00000000ffff0000 #define MC_STATUS_OTHER_INFO 0x01ffffff00000000 #define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_CMCI_P */ #define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */ #define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_TES_P */ #define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_TES_P */ #define MC_STATUS_PCC 0x0200000000000000 #define MC_STATUS_ADDRV 0x0400000000000000 #define MC_STATUS_MISCV 0x0800000000000000 #define MC_STATUS_EN 0x1000000000000000 #define MC_STATUS_UC 0x2000000000000000 #define MC_STATUS_OVER 0x4000000000000000 #define MC_STATUS_VAL 0x8000000000000000 #define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ #define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 #define MC_AMDNB_BANK 4 #define MC_MISC_AMD_VAL 0x8000000000000000 /* Counter presence valid */ #define MC_MISC_AMD_CNTP 0x4000000000000000 /* Counter present */ #define MC_MISC_AMD_LOCK 0x2000000000000000 /* Register locked */ #define MC_MISC_AMD_INTP 0x1000000000000000 /* Int. type can generate interrupts */ #define MC_MISC_AMD_LVT_MASK 0x00f0000000000000 /* Extended LVT offset */ #define MC_MISC_AMD_LVT_SHIFT 52 #define MC_MISC_AMD_CNTEN 0x0008000000000000 /* Counter enabled */ #define MC_MISC_AMD_INT_MASK 0x0006000000000000 /* Interrupt type */ #define MC_MISC_AMD_INT_LVT 0x0002000000000000 /* Interrupt via Extended LVT */ #define MC_MISC_AMD_INT_SMI 0x0004000000000000 /* SMI */ #define MC_MISC_AMD_OVERFLOW 0x0001000000000000 /* Counter overflow */ #define MC_MISC_AMD_CNT_MASK 0x00000fff00000000 /* Counter value */ #define MC_MISC_AMD_CNT_SHIFT 32 #define MC_MISC_AMD_CNT_MAX 0xfff #define MC_MISC_AMD_PTR_MASK 0x00000000ff000000 /* Pointer to additional registers */ #define MC_MISC_AMD_PTR_SHIFT 24 /* * The following four 3-byte registers control the non-cacheable regions. * These registers must be written as three separate bytes. * * NCRx+0: A31-A24 of starting address * NCRx+1: A23-A16 of starting address * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx. * * The non-cacheable region's starting address must be aligned to the * size indicated by the NCR_SIZE_xx field. */ #define NCR1 0xc4 #define NCR2 0xc7 #define NCR3 0xca #define NCR4 0xcd #define NCR_SIZE_0K 0 #define NCR_SIZE_4K 1 #define NCR_SIZE_8K 2 #define NCR_SIZE_16K 3 #define NCR_SIZE_32K 4 #define NCR_SIZE_64K 5 #define NCR_SIZE_128K 6 #define NCR_SIZE_256K 7 #define NCR_SIZE_512K 8 #define NCR_SIZE_1M 9 #define NCR_SIZE_2M 10 #define NCR_SIZE_4M 11 #define NCR_SIZE_8M 12 #define NCR_SIZE_16M 13 #define NCR_SIZE_32M 14 #define NCR_SIZE_4G 15 /* * The address region registers are used to specify the location and * size for the eight address regions. * * ARRx + 0: A31-A24 of start address * ARRx + 1: A23-A16 of start address * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx */ #define ARR0 0xc4 #define ARR1 0xc7 #define ARR2 0xca #define ARR3 0xcd #define ARR4 0xd0 #define ARR5 0xd3 #define ARR6 0xd6 #define ARR7 0xd9 #define ARR_SIZE_0K 0 #define ARR_SIZE_4K 1 #define ARR_SIZE_8K 2 #define ARR_SIZE_16K 3 #define ARR_SIZE_32K 4 #define ARR_SIZE_64K 5 #define ARR_SIZE_128K 6 #define ARR_SIZE_256K 7 #define ARR_SIZE_512K 8 #define ARR_SIZE_1M 9 #define ARR_SIZE_2M 10 #define ARR_SIZE_4M 11 #define ARR_SIZE_8M 12 #define ARR_SIZE_16M 13 #define ARR_SIZE_32M 14 #define ARR_SIZE_4G 15 /* * The region control registers specify the attributes associated with * the ARRx addres regions. */ #define RCR0 0xdc #define RCR1 0xdd #define RCR2 0xde #define RCR3 0xdf #define RCR4 0xe0 #define RCR5 0xe1 #define RCR6 0xe2 #define RCR7 0xe3 #define RCR_RCD 0x01 /* Disables caching for ARRx (x = 0-6). */ #define RCR_RCE 0x01 /* Enables caching for ARR7. */ #define RCR_WWO 0x02 /* Weak write ordering. */ #define RCR_WL 0x04 /* Weak locking. */ #define RCR_WG 0x08 /* Write gathering. */ #define RCR_WT 0x10 /* Write-through. */ #define RCR_NLB 0x20 /* LBA# pin is not asserted. */ /* AMD Write Allocate Top-Of-Memory and Control Register */ #define AMD_WT_ALLOC_TME 0x40000 /* top-of-memory enable */ #define AMD_WT_ALLOC_PRE 0x20000 /* programmable range enable */ #define AMD_WT_ALLOC_FRE 0x10000 /* fixed (A0000-FFFFF) range enable */ /* AMD64 MSR's */ #define MSR_EFER 0xc0000080 /* extended features */ #define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target/cs/ss */ #define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target rip */ #define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target rip */ #define MSR_SF_MASK 0xc0000084 /* syscall flags mask */ #define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ #define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ #define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ #define MSR_TSC_AUX 0xc0000103 #define MSR_PERFEVSEL0 0xc0010000 #define MSR_PERFEVSEL1 0xc0010001 #define MSR_PERFEVSEL2 0xc0010002 #define MSR_PERFEVSEL3 0xc0010003 #define MSR_K7_PERFCTR0 0xc0010004 #define MSR_K7_PERFCTR1 0xc0010005 #define MSR_K7_PERFCTR2 0xc0010006 #define MSR_K7_PERFCTR3 0xc0010007 #define MSR_SYSCFG 0xc0010010 #define MSR_HWCR 0xc0010015 #define MSR_IORRBASE0 0xc0010016 #define MSR_IORRMASK0 0xc0010017 #define MSR_IORRBASE1 0xc0010018 #define MSR_IORRMASK1 0xc0010019 #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ #define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ #define MSR_MC0_CTL_MASK 0xc0010044 #define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ #define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ #define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ #define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ #define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ #define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ #define MSR_VM_CR 0xc0010114 /* SVM: feature control */ #define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ #define MSR_AMD_CPUID07 0xc0011002 /* CPUID 07 %ebx override */ #define MSR_EXTFEATURES 0xc0011005 /* Extended CPUID Features override */ #define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ /* MSR_VM_CR related */ #define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ /* VIA ACE crypto featureset: for via_feature_rng */ #define VIA_HAS_RNG 1 /* cpu has RNG */ /* VIA ACE crypto featureset: for via_feature_xcrypt */ #define VIA_HAS_AES 1 /* cpu has AES */ #define VIA_HAS_SHA 2 /* cpu has SHA1 & SHA256 */ #define VIA_HAS_MM 4 /* cpu has RSA instructions */ #define VIA_HAS_AESCTR 8 /* cpu has AES-CTR instructions */ /* Centaur Extended Feature flags */ #define VIA_CPUID_HAS_RNG 0x000004 #define VIA_CPUID_DO_RNG 0x000008 #define VIA_CPUID_HAS_ACE 0x000040 #define VIA_CPUID_DO_ACE 0x000080 #define VIA_CPUID_HAS_ACE2 0x000100 #define VIA_CPUID_DO_ACE2 0x000200 #define VIA_CPUID_HAS_PHE 0x000400 #define VIA_CPUID_DO_PHE 0x000800 #define VIA_CPUID_HAS_PMM 0x001000 #define VIA_CPUID_DO_PMM 0x002000 /* VIA ACE xcrypt-* instruction context control options */ #define VIA_CRYPT_CWLO_ROUND_M 0x0000000f #define VIA_CRYPT_CWLO_ALG_M 0x00000070 #define VIA_CRYPT_CWLO_ALG_AES 0x00000000 #define VIA_CRYPT_CWLO_KEYGEN_M 0x00000080 #define VIA_CRYPT_CWLO_KEYGEN_HW 0x00000000 #define VIA_CRYPT_CWLO_KEYGEN_SW 0x00000080 #define VIA_CRYPT_CWLO_NORMAL 0x00000000 #define VIA_CRYPT_CWLO_INTERMEDIATE 0x00000100 #define VIA_CRYPT_CWLO_ENCRYPT 0x00000000 #define VIA_CRYPT_CWLO_DECRYPT 0x00000200 #define VIA_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */ #define VIA_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */ #define VIA_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */ #endif /* !_MACHINE_SPECIALREG_H_ */ Index: projects/runtime-coverage-v2/sys/x86/include/x86_var.h =================================================================== --- projects/runtime-coverage-v2/sys/x86/include/x86_var.h (revision 347598) +++ projects/runtime-coverage-v2/sys/x86/include/x86_var.h (revision 347599) @@ -1,142 +1,144 @@ /*- * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _X86_X86_VAR_H_ #define _X86_X86_VAR_H_ /* * Miscellaneous machine-dependent declarations. */ extern long Maxmem; extern u_int basemem; extern int busdma_swi_pending; extern u_int cpu_exthigh; extern u_int cpu_feature; extern u_int cpu_feature2; extern u_int amd_feature; extern u_int amd_feature2; extern u_int amd_rascap; extern u_int amd_pminfo; extern u_int amd_extended_feature_extensions; extern u_int via_feature_rng; extern u_int via_feature_xcrypt; extern u_int cpu_clflush_line_size; extern u_int cpu_stdext_feature; extern u_int cpu_stdext_feature2; extern u_int cpu_stdext_feature3; extern uint64_t cpu_ia32_arch_caps; extern u_int cpu_fxsr; extern u_int cpu_high; extern u_int cpu_id; extern u_int cpu_max_ext_state_size; extern u_int cpu_mxcsr_mask; extern u_int cpu_procinfo; extern u_int cpu_procinfo2; extern char cpu_vendor[]; extern u_int cpu_vendor_id; extern u_int cpu_mon_mwait_flags; extern u_int cpu_mon_min_size; extern u_int cpu_mon_max_size; extern u_int cpu_maxphyaddr; extern char ctx_switch_xsave[]; extern u_int hv_high; extern char hv_vendor[]; extern char kstack[]; extern char sigcode[]; extern int szsigcode; extern int vm_page_dump_size; extern int workaround_erratum383; extern int _udatasel; extern int _ucodesel; extern int _ucode32sel; extern int _ufssel; extern int _ugssel; extern int use_xsave; extern uint64_t xsave_mask; extern u_int max_apic_id; extern int i386_read_exec; extern int pti; extern int hw_ibrs_active; +extern int hw_mds_disable; extern int hw_ssb_active; struct pcb; struct thread; struct reg; struct fpreg; struct dbreg; struct dumperinfo; struct trapframe; /* * The interface type of the interrupt handler entry point cannot be * expressed in C. Use simplest non-variadic function type as an * approximation. */ typedef void alias_for_inthand_t(void); bool acpi_get_fadt_bootflags(uint16_t *flagsp); void *alloc_fpusave(int flags); void busdma_swi(void); vm_paddr_t cpu_getmaxphyaddr(void); bool cpu_mwait_usable(void); void cpu_probe_amdc1e(void); void cpu_setregs(void); bool disable_wp(void); void restore_wp(bool old_wp); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void finishidentcpu(void); void identify_cpu1(void); void identify_cpu2(void); void identify_cpu_fixup_bsp(void); void identify_hypervisor(void); void initializecpu(void); void initializecpucache(void); bool fix_cpuid(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); int is_physical_memory(vm_paddr_t addr); int isa_nmi(int cd); void handle_ibrs_entry(void); void handle_ibrs_exit(void); void hw_ibrs_recalculate(void); +void hw_mds_recalculate(void); void hw_ssb_recalculate(bool all_cpus); void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame); void nmi_call_kdb_smp(u_int type, struct trapframe *frame); void nmi_handle_intr(u_int type, struct trapframe *frame); void pagecopy(void *from, void *to); void printcpuinfo(void); int pti_get_default(void); int user_dbreg_trap(register_t dr6); int minidumpsys(struct dumperinfo *); struct pcb *get_pcb_td(struct thread *td); #endif Index: projects/runtime-coverage-v2/sys/x86/x86/cpu_machdep.c =================================================================== --- projects/runtime-coverage-v2/sys/x86/x86/cpu_machdep.c (revision 347598) +++ projects/runtime-coverage-v2/sys/x86/x86/cpu_machdep.c (revision 347599) @@ -1,977 +1,1175 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" #ifdef __i386__ #include "opt_apic.h" #endif #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef CPU_ELAN #include #endif #include #include #include #include #include #include #include #include #include #include #include #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 #ifdef SMP static u_int cpu_reset_proxyid; static volatile u_int cpu_reset_proxy_active; #endif /* * Automatically initialized per CPU errata in cpu_idle_tun below. */ bool mwait_cpustop_broken = false; SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN, &mwait_cpustop_broken, 0, "Can not reliably wake MONITOR/MWAIT cpus without interrupts"); /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } void acpi_cpu_c1(void) { __asm __volatile("sti; hlt"); } /* * Use mwait to pause execution while waiting for an interrupt or * another thread to signal that there is more work. * * NOTE: Interrupts will cause a wakeup; however, this function does * not enable interrupt handling. The caller is responsible to enable * interrupts. */ void acpi_cpu_idle_mwait(uint32_t mwait_hint) { int *state; uint64_t v; /* * A comment in Linux patch claims that 'CPUs run faster with * speculation protection disabled. All CPU threads in a core * must disable speculation protection for it to be * disabled. Disable it while we are idle so the other * hyperthread can run fast.' * * XXXKIB. Software coordination mode should be supported, * but all Intel CPUs provide hardware coordination. */ state = &PCPU_PTR(monitorbuf)->idle_state; KASSERT(atomic_load_int(state) == STATE_SLEEPING, ("cpu_mwait_cx: wrong monitorbuf state")); atomic_store_int(state, STATE_MWAIT); if (PCPU_GET(ibpb_set) || hw_ssb_active) { v = rdmsr(MSR_IA32_SPEC_CTRL); wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS | IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD)); } else { v = 0; } cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) cpu_mwait(MWAIT_INTRBREAK, mwait_hint); /* * SSB cannot be disabled while we sleep, or rather, if it was * disabled, the sysctl thread will bind to our cpu to tweak * MSR. */ if (v != 0) wrmsr(MSR_IA32_SPEC_CTRL, v); /* * We should exit on any event that interrupts mwait, because * that event might be a wanted interrupt. */ atomic_store_int(state, STATE_RUNNING); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef __i386__ if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #endif /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } static void cpu_reset_real(void) { struct region_descriptor null_idt; int b; disable_intr(); #ifdef CPU_ELAN if (elan_mmcr != NULL) elan_mmcr->RESCFG = 1; #endif #ifdef __i386__ if (cpu == CPU_GEODE1100) { /* Attempt Geode's own reset */ outl(0xcf8, 0x80009044ul); outl(0xcfc, 0xf); } #endif #if !defined(BROKEN_KEYBOARD_RESET) /* * Attempt to do a CPU reset via the keyboard controller, * do not turn off GateA20, as any machine that fails * to do the reset here would then end up in no man's land. */ outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ #endif /* * Attempt to force a reset via the Reset Control register at * I/O port 0xcf9. Bit 2 forces a system reset when it * transitions from 0 to 1. Bit 1 selects the type of reset * to attempt: 0 selects a "soft" reset, and 1 selects a * "hard" reset. We try a "hard" reset. The first write sets * bit 1 to select a "hard" reset and clears bit 2. The * second write forces a 0 -> 1 transition in bit 2 to trigger * a reset. */ outb(0xcf9, 0x2); outb(0xcf9, 0x6); DELAY(500000); /* wait 0.5 sec to see if that did it */ /* * Attempt to force a reset via the Fast A20 and Init register * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. * Bit 0 asserts INIT# when set to 1. We are careful to only * preserve bit 1 while setting bit 0. We also must clear bit * 0 before setting it if it isn't already clear. */ b = inb(0x92); if (b != 0xff) { if ((b & 0x1) != 0) outb(0x92, b & 0xfe); outb(0x92, b | 0x1); DELAY(500000); /* wait 0.5 sec to see if that did it */ } printf("No known reset method worked, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ /* Wipe the IDT. */ null_idt.rd_limit = 0; null_idt.rd_base = 0; lidt(&null_idt); /* "good night, sweet prince .... " */ breakpoint(); /* NOTREACHED */ while(1); } #ifdef SMP static void cpu_reset_proxy(void) { cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) ia32_pause(); /* Wait for other cpu to see that we've started */ printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); DELAY(1000000); cpu_reset_real(); } #endif void cpu_reset(void) { #ifdef SMP struct monitorbuf *mb; cpuset_t map; u_int cnt; if (smp_started) { map = all_cpus; CPU_CLR(PCPU_GET(cpuid), &map); CPU_NAND(&map, &stopped_cpus); if (!CPU_EMPTY(&map)) { printf("cpu_reset: Stopping other CPUs\n"); stop_cpus(map); } if (PCPU_GET(cpuid) != 0) { cpu_reset_proxyid = PCPU_GET(cpuid); cpustop_restartfunc = cpu_reset_proxy; cpu_reset_proxy_active = 0; printf("cpu_reset: Restarting BSP\n"); /* Restart CPU #0. */ CPU_SETOF(0, &started_cpus); mb = &pcpu_find(0)->pc_monitorbuf; atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_RUNNING); wmb(); cnt = 0; while (cpu_reset_proxy_active == 0 && cnt < 10000000) { ia32_pause(); cnt++; /* Wait for BSP to announce restart */ } if (cpu_reset_proxy_active == 0) { printf("cpu_reset: Failed to restart BSP\n"); } else { cpu_reset_proxy_active = 2; while (1) ia32_pause(); /* NOTREACHED */ } } DELAY(1000000); } #endif cpu_reset_real(); /* NOTREACHED */ } bool cpu_mwait_usable(void) { return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_SLEEPING); /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); } static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_SLEEPING); /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) enable_intr(); else acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); } static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_MWAIT); /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { atomic_store_int(state, STATE_RUNNING); enable_intr(); return; } cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); atomic_store_int(state, STATE_RUNNING); } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_RUNNING); /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) void cpu_probe_amdc1e(void) { /* * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } static int cpu_idle_apl31_workaround; SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW, &cpu_idle_apl31_workaround, 0, "Apollo Lake APL31 MWAIT bug workaround"); int cpu_idle_wakeup(int cpu) { struct monitorbuf *mb; int *state; mb = &pcpu_find(cpu)->pc_monitorbuf; state = &mb->idle_state; switch (atomic_load_int(state)) { case STATE_SLEEPING: return (0); case STATE_MWAIT: atomic_store_int(state, STATE_RUNNING); return (cpu_idle_apl31_workaround ? 0 : 1); case STATE_RUNNING: return (1); default: panic("bad monitor state"); return (1); } } /* * Ordered by speed/power consumption. */ static struct { void *id_fn; char *id_name; int id_cpuid2_flag; } idle_tbl[] = { { .id_fn = cpu_idle_spin, .id_name = "spin" }, { .id_fn = cpu_idle_mwait, .id_name = "mwait", .id_cpuid2_flag = CPUID2_MON }, { .id_fn = cpu_idle_hlt, .id_name = "hlt" }, { .id_fn = cpu_idle_acpi, .id_name = "acpi" }, }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static bool cpu_idle_selector(const char *new_idle_name) { int i; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; if (strcmp(idle_tbl[i].id_name, new_idle_name)) continue; cpu_idle_fn = idle_tbl[i].id_fn; if (bootverbose) printf("CPU idle set to %s\n", idle_tbl[i].id_name); return (true); } return (false); } static int cpu_idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16], *p; int error, i; p = "unknown"; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); return (cpu_idle_selector(buf) ? 0 : EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, cpu_idle_sysctl, "A", "currently selected idle function"); static void cpu_idle_tun(void *unused __unused) { char tunvar[16]; if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar))) cpu_idle_selector(tunvar); else if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) { /* Ryzen erratas 1057, 1109. */ cpu_idle_selector("hlt"); idle_mwait = 0; mwait_cpustop_broken = true; } if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) { /* * Apollo Lake errata APL31 (public errata APL30). * Stores to the armed address range may not trigger * MWAIT to resume execution. OS needs to use * interrupts to wake processors from MWAIT-induced * sleep states. */ cpu_idle_apl31_workaround = 1; mwait_cpustop_broken = true; } TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround); } SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL); static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, &panic_on_nmi, 0, "Panic on NMI raised by hardware failure"); int nmi_is_broadcast = 1; SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, &nmi_is_broadcast, 0, "Chipset NMI is broadcast"); #ifdef KDB int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN, &kdb_on_nmi, 0, "Go to KDB on NMI with unknown source"); #endif void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) { bool claimed = false; #ifdef DEV_ISA /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err)) { claimed = true; if (panic_on_nmi) panic("NMI indicates hardware failure"); } #endif /* DEV_ISA */ #ifdef KDB if (!claimed && kdb_on_nmi) { /* * NMI can be hooked up to a pushbutton for debugging. */ printf("NMI/cpu%d ... going to debugger\n", cpu); kdb_trap(type, 0, frame); } #endif /* KDB */ } void nmi_handle_intr(u_int type, struct trapframe *frame) { #ifdef SMP if (nmi_is_broadcast) { nmi_call_kdb_smp(type, frame); return; } #endif nmi_call_kdb(PCPU_GET(cpuid), type, frame); } int hw_ibrs_active; int hw_ibrs_disable = 1; SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active"); void hw_ibrs_recalculate(void) { uint64_t v; if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) { if (hw_ibrs_disable) { v = rdmsr(MSR_IA32_SPEC_CTRL); v &= ~(uint64_t)IA32_SPEC_CTRL_IBRS; wrmsr(MSR_IA32_SPEC_CTRL, v); } else { v = rdmsr(MSR_IA32_SPEC_CTRL); v |= IA32_SPEC_CTRL_IBRS; wrmsr(MSR_IA32_SPEC_CTRL, v); } return; } hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable; } static int hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ibrs_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ibrs_disable = val != 0; hw_ibrs_recalculate(); return (0); } SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I", "Disable Indirect Branch Restricted Speculation"); int hw_ssb_active; int hw_ssb_disable; SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD, &hw_ssb_active, 0, "Speculative Store Bypass Disable active"); static void hw_ssb_set_one(bool enable) { uint64_t v; v = rdmsr(MSR_IA32_SPEC_CTRL); if (enable) v |= (uint64_t)IA32_SPEC_CTRL_SSBD; else v &= ~(uint64_t)IA32_SPEC_CTRL_SSBD; wrmsr(MSR_IA32_SPEC_CTRL, v); } static void hw_ssb_set(bool enable, bool for_all_cpus) { struct thread *td; int bound_cpu, i, is_bound; if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) { hw_ssb_active = 0; return; } hw_ssb_active = enable; if (for_all_cpus) { td = curthread; thread_lock(td); is_bound = sched_is_bound(td); bound_cpu = td->td_oncpu; CPU_FOREACH(i) { sched_bind(td, i); hw_ssb_set_one(enable); } if (is_bound) sched_bind(td, bound_cpu); else sched_unbind(td); thread_unlock(td); } else { hw_ssb_set_one(enable); } } void hw_ssb_recalculate(bool all_cpus) { switch (hw_ssb_disable) { default: hw_ssb_disable = 0; /* FALLTHROUGH */ case 0: /* off */ hw_ssb_set(false, all_cpus); break; case 1: /* on */ hw_ssb_set(true, all_cpus); break; case 2: /* auto */ hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ? false : true, all_cpus); break; } } static int hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ssb_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ssb_disable = val; hw_ssb_recalculate(true); return (0); } SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ssb_disable_handler, "I", "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto"); + +int hw_mds_disable; + +/* + * Handler for Microarchitectural Data Sampling issues. Really not a + * pointer to C function: on amd64 the code must not change any CPU + * architectural state except possibly %rflags. Also, it is always + * called with interrupts disabled. + */ +void (*mds_handler)(void); +void mds_handler_void(void); +void mds_handler_verw(void); +void mds_handler_ivb(void); +void mds_handler_bdw(void); +void mds_handler_skl_sse(void); +void mds_handler_skl_avx(void); +void mds_handler_skl_avx512(void); +void mds_handler_silvermont(void); + +static int +sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS) +{ + const char *state; + + if (mds_handler == mds_handler_void) + state = "inactive"; + else if (mds_handler == mds_handler_verw) + state = "VERW"; + else if (mds_handler == mds_handler_ivb) + state = "software IvyBridge"; + else if (mds_handler == mds_handler_bdw) + state = "software Broadwell"; + else if (mds_handler == mds_handler_skl_sse) + state = "software Skylake SSE"; + else if (mds_handler == mds_handler_skl_avx) + state = "software Skylake AVX"; + else if (mds_handler == mds_handler_skl_avx512) + state = "software Skylake AVX512"; + else if (mds_handler == mds_handler_silvermont) + state = "software Silvermont"; + else + state = "unknown"; + return (SYSCTL_OUT(req, state, strlen(state))); +} + +SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_hw_mds_disable_state_handler, "A", + "Microarchitectural Data Sampling Mitigation state"); + +_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512"); + +void +hw_mds_recalculate(void) +{ + struct pcpu *pc; + vm_offset_t b64; + u_long xcr0; + int i; + + /* + * Allow user to force VERW variant even if MD_CLEAR is not + * reported. For instance, hypervisor might unknowingly + * filter the cap out. + * For the similar reasons, and for testing, allow to enable + * mitigation even for RDCL_NO or MDS_NO caps. + */ + if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 || + ((cpu_ia32_arch_caps & (IA32_ARCH_CAP_RDCL_NO | + IA32_ARCH_CAP_MDS_NO)) != 0 && hw_mds_disable == 3)) { + mds_handler = mds_handler_void; + } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 && + hw_mds_disable == 3) || hw_mds_disable == 1) { + mds_handler = mds_handler_verw; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e || + CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a || + CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 || + CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d || + CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e || + CPUID_TO_MODEL(cpu_id) == 0x3a) && + (hw_mds_disable == 2 || hw_mds_disable == 3)) { + /* + * Nehalem, SandyBridge, IvyBridge + */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) { + pc->pc_mds_buf = malloc_domainset(672, M_TEMP, + DOMAINSET_PREF(pc->pc_domain), M_WAITOK); + bzero(pc->pc_mds_buf, 16); + } + } + mds_handler = mds_handler_ivb; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c || + CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 || + CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f || + CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) && + (hw_mds_disable == 2 || hw_mds_disable == 3)) { + /* + * Haswell, Broadwell + */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) { + pc->pc_mds_buf = malloc_domainset(1536, M_TEMP, + DOMAINSET_PREF(pc->pc_domain), M_WAITOK); + bzero(pc->pc_mds_buf, 16); + } + } + mds_handler = mds_handler_bdw; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id & + CPUID_STEPPING) <= 5) || + CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e || + (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id & + CPUID_STEPPING) <= 0xb) || + (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id & + CPUID_STEPPING) <= 0xc)) && + (hw_mds_disable == 2 || hw_mds_disable == 3)) { + /* + * Skylake, KabyLake, CoffeeLake, WhiskeyLake, + * CascadeLake + */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) { + pc->pc_mds_buf = malloc_domainset(6 * 1024, + M_TEMP, DOMAINSET_PREF(pc->pc_domain), + M_WAITOK); + b64 = (vm_offset_t)malloc_domainset(64 + 63, + M_TEMP, DOMAINSET_PREF(pc->pc_domain), + M_WAITOK); + pc->pc_mds_buf64 = (void *)roundup2(b64, 64); + bzero(pc->pc_mds_buf64, 64); + } + } + xcr0 = rxcr(0); + if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 && + (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0) + mds_handler = mds_handler_skl_avx512; + else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 && + (cpu_feature2 & CPUID2_AVX) != 0) + mds_handler = mds_handler_skl_avx; + else + mds_handler = mds_handler_skl_sse; + } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && + ((CPUID_TO_MODEL(cpu_id) == 0x37 || + CPUID_TO_MODEL(cpu_id) == 0x4a || + CPUID_TO_MODEL(cpu_id) == 0x4c || + CPUID_TO_MODEL(cpu_id) == 0x4d || + CPUID_TO_MODEL(cpu_id) == 0x5a || + CPUID_TO_MODEL(cpu_id) == 0x5d || + CPUID_TO_MODEL(cpu_id) == 0x6e || + CPUID_TO_MODEL(cpu_id) == 0x65 || + CPUID_TO_MODEL(cpu_id) == 0x75 || + CPUID_TO_MODEL(cpu_id) == 0x1c || + CPUID_TO_MODEL(cpu_id) == 0x26 || + CPUID_TO_MODEL(cpu_id) == 0x27 || + CPUID_TO_MODEL(cpu_id) == 0x35 || + CPUID_TO_MODEL(cpu_id) == 0x36 || + CPUID_TO_MODEL(cpu_id) == 0x7a))) { + /* Silvermont, Airmont */ + CPU_FOREACH(i) { + pc = pcpu_find(i); + if (pc->pc_mds_buf == NULL) + pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK); + } + mds_handler = mds_handler_silvermont; + } else { + hw_mds_disable = 0; + mds_handler = mds_handler_void; + } +} + +static int +sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = hw_mds_disable; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (val < 0 || val > 3) + return (EINVAL); + hw_mds_disable = val; + hw_mds_recalculate(); + return (0); +} + +SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT | + CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, + sysctl_mds_disable_handler, "I", + "Microarchitectural Data Sampling Mitigation " + "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO"); /* * Enable and restore kernel text write permissions. * Callers must ensure that disable_wp()/restore_wp() are executed * without rescheduling on the same core. */ bool disable_wp(void) { u_int cr0; cr0 = rcr0(); if ((cr0 & CR0_WP) == 0) return (false); load_cr0(cr0 & ~CR0_WP); return (true); } void restore_wp(bool old_wp) { if (old_wp) load_cr0(rcr0() | CR0_WP); } bool acpi_get_fadt_bootflags(uint16_t *flagsp) { #ifdef DEV_ACPI ACPI_TABLE_FADT *fadt; vm_paddr_t physaddr; physaddr = acpi_find_table(ACPI_SIG_FADT); if (physaddr == 0) return (false); fadt = acpi_map_table(physaddr, ACPI_SIG_FADT); if (fadt == NULL) return (false); *flagsp = fadt->BootFlags; acpi_unmap_table(fadt); return (true); #else return (false); #endif } Index: projects/runtime-coverage-v2/usr.sbin/mountd/mountd.c =================================================================== --- projects/runtime-coverage-v2/usr.sbin/mountd/mountd.c (revision 347598) +++ projects/runtime-coverage-v2/usr.sbin/mountd/mountd.c (revision 347599) @@ -1,3340 +1,3337 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Herb Hasler and Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1989, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /*not lint*/ #if 0 #ifndef lint static char sccsid[] = "@(#)mountd.c 8.15 (Berkeley) 5/1/95"; #endif /*not lint*/ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pathnames.h" #include "mntopts.h" #ifdef DEBUG #include #endif /* * Structures for keeping the mount list and export list */ struct mountlist { char ml_host[MNTNAMLEN+1]; char ml_dirp[MNTPATHLEN+1]; SLIST_ENTRY(mountlist) next; }; struct dirlist { struct dirlist *dp_left; struct dirlist *dp_right; int dp_flag; struct hostlist *dp_hosts; /* List of hosts this dir exported to */ char *dp_dirp; }; /* dp_flag bits */ #define DP_DEFSET 0x1 #define DP_HOSTSET 0x2 struct exportlist { struct dirlist *ex_dirl; struct dirlist *ex_defdir; + struct grouplist *ex_grphead; int ex_flag; fsid_t ex_fs; char *ex_fsdir; char *ex_indexfile; int ex_numsecflavors; int ex_secflavors[MAXSECFLAVORS]; int ex_defnumsecflavors; int ex_defsecflavors[MAXSECFLAVORS]; SLIST_ENTRY(exportlist) entries; }; /* ex_flag bits */ #define EX_LINKED 0x1 SLIST_HEAD(exportlisthead, exportlist); struct netmsk { struct sockaddr_storage nt_net; struct sockaddr_storage nt_mask; char *nt_name; }; union grouptypes { struct addrinfo *gt_addrinfo; struct netmsk gt_net; }; struct grouplist { int gr_type; union grouptypes gr_ptr; struct grouplist *gr_next; int gr_numsecflavors; int gr_secflavors[MAXSECFLAVORS]; }; /* Group types */ #define GT_NULL 0x0 #define GT_HOST 0x1 #define GT_NET 0x2 #define GT_DEFAULT 0x3 #define GT_IGNORE 0x5 struct hostlist { int ht_flag; /* Uses DP_xx bits */ struct grouplist *ht_grp; struct hostlist *ht_next; }; struct fhreturn { int fhr_flag; int fhr_vers; nfsfh_t fhr_fh; int fhr_numsecflavors; int *fhr_secflavors; }; #define GETPORT_MAXTRY 20 /* Max tries to get a port # */ /* Global defs */ static char *add_expdir(struct dirlist **, char *, int); static void add_dlist(struct dirlist **, struct dirlist *, struct grouplist *, int, struct exportlist *); static void add_mlist(char *, char *); static int check_dirpath(char *); static int check_options(struct dirlist *); static int checkmask(struct sockaddr *sa); static int chk_host(struct dirlist *, struct sockaddr *, int *, int *, int *, int **); static char *strsep_quote(char **stringp, const char *delim); static int create_service(struct netconfig *nconf); static void complete_service(struct netconfig *nconf, char *port_str); static void clearout_service(void); static void del_mlist(char *hostp, char *dirp); static struct dirlist *dirp_search(struct dirlist *, char *); static int do_mount(struct exportlist *, struct grouplist *, int, struct xucred *, char *, int, struct statfs *); static int do_opt(char **, char **, struct exportlist *, struct grouplist *, int *, int *, struct xucred *); static struct exportlist *ex_search(fsid_t *, struct exportlisthead *); static struct exportlist *get_exp(void); static void free_dir(struct dirlist *); static void free_exp(struct exportlist *); static void free_grp(struct grouplist *); static void free_host(struct hostlist *); static void get_exportlist(void); static void insert_exports(struct exportlist *, struct exportlisthead *); static void free_exports(struct exportlisthead *); static void read_exportfile(void); static void delete_export(struct iovec *, int, struct statfs *, char *); static int get_host(char *, struct grouplist *, struct grouplist *); static struct hostlist *get_ht(void); static int get_line(void); static void get_mountlist(void); static int get_net(char *, struct netmsk *, int); static void getexp_err(struct exportlist *, struct grouplist *, const char *); static struct grouplist *get_grp(void); static void hang_dirp(struct dirlist *, struct grouplist *, struct exportlist *, int); static void huphandler(int sig); static int makemask(struct sockaddr_storage *ssp, int bitlen); static void mntsrv(struct svc_req *, SVCXPRT *); static void nextfield(char **, char **); static void out_of_mem(void); static void parsecred(char *, struct xucred *); static int parsesec(char *, struct exportlist *); static int put_exlist(struct dirlist *, XDR *, struct dirlist *, int *, int); static void *sa_rawaddr(struct sockaddr *sa, int *nbytes); static int sacmp(struct sockaddr *sa1, struct sockaddr *sa2, struct sockaddr *samask); static int scan_tree(struct dirlist *, struct sockaddr *); static void usage(void); static int xdr_dir(XDR *, char *); static int xdr_explist(XDR *, caddr_t); static int xdr_explist_brief(XDR *, caddr_t); static int xdr_explist_common(XDR *, caddr_t, int); static int xdr_fhs(XDR *, caddr_t); static int xdr_mlist(XDR *, caddr_t); static void terminate(int); static struct exportlisthead exphead = SLIST_HEAD_INITIALIZER(&exphead); static SLIST_HEAD(, mountlist) mlhead = SLIST_HEAD_INITIALIZER(&mlhead); -static struct grouplist *grphead; static char *exnames_default[2] = { _PATH_EXPORTS, NULL }; static char **exnames; static char **hosts = NULL; static struct xucred def_anon = { XUCRED_VERSION, (uid_t)65534, 1, { (gid_t)65533 }, NULL }; static int force_v2 = 0; static int resvport_only = 1; static int nhosts = 0; static int dir_only = 1; static int dolog = 0; static int got_sighup = 0; static int xcreated = 0; static char *svcport_str = NULL; static int mallocd_svcport = 0; static int *sock_fd; static int sock_fdcnt; static int sock_fdpos; static int suspend_nfsd = 0; static int opt_flags; static int have_v6 = 1; static int v4root_phase = 0; static char v4root_dirpath[PATH_MAX + 1]; static int has_publicfh = 0; static struct pidfh *pfh = NULL; /* Bits for opt_flags above */ #define OP_MAPROOT 0x01 #define OP_MAPALL 0x02 /* 0x4 free */ #define OP_MASK 0x08 #define OP_NET 0x10 #define OP_ALLDIRS 0x40 #define OP_HAVEMASK 0x80 /* A mask was specified or inferred. */ #define OP_QUIET 0x100 #define OP_MASKLEN 0x200 #define OP_SEC 0x400 #ifdef DEBUG static int debug = 1; static void SYSLOG(int, const char *, ...) __printflike(2, 3); #define syslog SYSLOG #else static int debug = 0; #endif /* * Similar to strsep(), but it allows for quoted strings * and escaped characters. * * It returns the string (or NULL, if *stringp is NULL), * which is a de-quoted version of the string if necessary. * * It modifies *stringp in place. */ static char * strsep_quote(char **stringp, const char *delim) { char *srcptr, *dstptr, *retval; char quot = 0; if (stringp == NULL || *stringp == NULL) return (NULL); srcptr = dstptr = retval = *stringp; while (*srcptr) { /* * We're looking for several edge cases here. * First: if we're in quote state (quot != 0), * then we ignore the delim characters, but otherwise * process as normal, unless it is the quote character. * Second: if the current character is a backslash, * we take the next character as-is, without checking * for delim, quote, or backslash. Exception: if the * next character is a NUL, that's the end of the string. * Third: if the character is a quote character, we toggle * quote state. * Otherwise: check the current character for NUL, or * being in delim, and end the string if either is true. */ if (*srcptr == '\\') { srcptr++; /* * The edge case here is if the next character * is NUL, we want to stop processing. But if * it's not NUL, then we simply want to copy it. */ if (*srcptr) { *dstptr++ = *srcptr++; } continue; } if (quot == 0 && (*srcptr == '\'' || *srcptr == '"')) { quot = *srcptr++; continue; } if (quot && *srcptr == quot) { /* End of the quoted part */ quot = 0; srcptr++; continue; } if (!quot && strchr(delim, *srcptr)) break; *dstptr++ = *srcptr++; } *dstptr = 0; /* Terminate the string */ *stringp = (*srcptr == '\0') ? NULL : srcptr + 1; return (retval); } /* * Mountd server for NFS mount protocol as described in: * NFS: Network File System Protocol Specification, RFC1094, Appendix A * The optional arguments are the exports file name * default: _PATH_EXPORTS * and "-n" to allow nonroot mount. */ int main(int argc, char **argv) { fd_set readfds; struct netconfig *nconf; char *endptr, **hosts_bak; void *nc_handle; pid_t otherpid; in_port_t svcport; int c, k, s; int maxrec = RPC_MAXDATASIZE; int attempt_cnt, port_len, port_pos, ret; char **port_list; /* Check that another mountd isn't already running. */ pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &otherpid); if (pfh == NULL) { if (errno == EEXIST) errx(1, "mountd already running, pid: %d.", otherpid); warn("cannot open or create pidfile"); } s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (s < 0) have_v6 = 0; else close(s); while ((c = getopt(argc, argv, "2deh:lnp:rS")) != -1) switch (c) { case '2': force_v2 = 1; break; case 'e': /* now a no-op, since this is the default */ break; case 'n': resvport_only = 0; break; case 'r': dir_only = 0; break; case 'd': debug = debug ? 0 : 1; break; case 'l': dolog = 1; break; case 'p': endptr = NULL; svcport = (in_port_t)strtoul(optarg, &endptr, 10); if (endptr == NULL || *endptr != '\0' || svcport == 0 || svcport >= IPPORT_MAX) usage(); svcport_str = strdup(optarg); break; case 'h': ++nhosts; hosts_bak = hosts; hosts_bak = realloc(hosts, nhosts * sizeof(char *)); if (hosts_bak == NULL) { if (hosts != NULL) { for (k = 0; k < nhosts; k++) free(hosts[k]); free(hosts); out_of_mem(); } } hosts = hosts_bak; hosts[nhosts - 1] = strdup(optarg); if (hosts[nhosts - 1] == NULL) { for (k = 0; k < (nhosts - 1); k++) free(hosts[k]); free(hosts); out_of_mem(); } break; case 'S': suspend_nfsd = 1; break; default: usage(); } if (modfind("nfsd") < 0) { /* Not present in kernel, try loading it */ if (kldload("nfsd") < 0 || modfind("nfsd") < 0) errx(1, "NFS server is not available"); } argc -= optind; argv += optind; - grphead = (struct grouplist *)NULL; if (argc > 0) exnames = argv; else exnames = exnames_default; openlog("mountd", LOG_PID, LOG_DAEMON); if (debug) warnx("getting export list"); get_exportlist(); if (debug) warnx("getting mount list"); get_mountlist(); if (debug) warnx("here we go"); if (debug == 0) { daemon(0, 0); signal(SIGINT, SIG_IGN); signal(SIGQUIT, SIG_IGN); } signal(SIGHUP, huphandler); signal(SIGTERM, terminate); signal(SIGPIPE, SIG_IGN); pidfile_write(pfh); rpcb_unset(MOUNTPROG, MOUNTVERS, NULL); rpcb_unset(MOUNTPROG, MOUNTVERS3, NULL); rpc_control(RPC_SVC_CONNMAXREC_SET, &maxrec); if (!resvport_only) { if (sysctlbyname("vfs.nfsd.nfs_privport", NULL, NULL, &resvport_only, sizeof(resvport_only)) != 0 && errno != ENOENT) { syslog(LOG_ERR, "sysctl: %m"); exit(1); } } /* * If no hosts were specified, add a wildcard entry to bind to * INADDR_ANY. Otherwise make sure 127.0.0.1 and ::1 are added to the * list. */ if (nhosts == 0) { hosts = malloc(sizeof(char *)); if (hosts == NULL) out_of_mem(); hosts[0] = "*"; nhosts = 1; } else { hosts_bak = hosts; if (have_v6) { hosts_bak = realloc(hosts, (nhosts + 2) * sizeof(char *)); if (hosts_bak == NULL) { for (k = 0; k < nhosts; k++) free(hosts[k]); free(hosts); out_of_mem(); } else hosts = hosts_bak; nhosts += 2; hosts[nhosts - 2] = "::1"; } else { hosts_bak = realloc(hosts, (nhosts + 1) * sizeof(char *)); if (hosts_bak == NULL) { for (k = 0; k < nhosts; k++) free(hosts[k]); free(hosts); out_of_mem(); } else { nhosts += 1; hosts = hosts_bak; } } hosts[nhosts - 1] = "127.0.0.1"; } attempt_cnt = 1; sock_fdcnt = 0; sock_fd = NULL; port_list = NULL; port_len = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { if (nconf->nc_flag & NC_VISIBLE) { if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else { ret = create_service(nconf); if (ret == 1) /* Ignore this call */ continue; if (ret < 0) { /* * Failed to bind port, so close off * all sockets created and try again * if the port# was dynamically * assigned via bind(2). */ clearout_service(); if (mallocd_svcport != 0 && attempt_cnt < GETPORT_MAXTRY) { free(svcport_str); svcport_str = NULL; mallocd_svcport = 0; } else { errno = EADDRINUSE; syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } /* Start over at the first service. */ free(sock_fd); sock_fdcnt = 0; sock_fd = NULL; nc_handle = setnetconfig(); attempt_cnt++; } else if (mallocd_svcport != 0 && attempt_cnt == GETPORT_MAXTRY) { /* * For the last attempt, allow * different port #s for each nconf * by saving the svcport_str and * setting it back to NULL. */ port_list = realloc(port_list, (port_len + 1) * sizeof(char *)); if (port_list == NULL) out_of_mem(); port_list[port_len++] = svcport_str; svcport_str = NULL; mallocd_svcport = 0; } } } } /* * Successfully bound the ports, so call complete_service() to * do the rest of the setup on the service(s). */ sock_fdpos = 0; port_pos = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { if (nconf->nc_flag & NC_VISIBLE) { if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else if (port_list != NULL) { if (port_pos >= port_len) { syslog(LOG_ERR, "too many port#s"); exit(1); } complete_service(nconf, port_list[port_pos++]); } else complete_service(nconf, svcport_str); } } endnetconfig(nc_handle); free(sock_fd); if (port_list != NULL) { for (port_pos = 0; port_pos < port_len; port_pos++) free(port_list[port_pos]); free(port_list); } if (xcreated == 0) { syslog(LOG_ERR, "could not create any services"); exit(1); } /* Expand svc_run() here so that we can call get_exportlist(). */ for (;;) { if (got_sighup) { get_exportlist(); got_sighup = 0; } readfds = svc_fdset; switch (select(svc_maxfd + 1, &readfds, NULL, NULL, NULL)) { case -1: if (errno == EINTR) continue; syslog(LOG_ERR, "mountd died: select: %m"); exit(1); case 0: continue; default: svc_getreqset(&readfds); } } } /* * This routine creates and binds sockets on the appropriate * addresses. It gets called one time for each transport. * It returns 0 upon success, 1 for ingore the call and -1 to indicate * bind failed with EADDRINUSE. * Any file descriptors that have been created are stored in sock_fd and * the total count of them is maintained in sock_fdcnt. */ static int create_service(struct netconfig *nconf) { struct addrinfo hints, *res = NULL; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct __rpc_sockinfo si; int aicode; int fd; int nhostsbak; int one = 1; int r; u_int32_t host_addr[4]; /* IPv4 or IPv6 */ int mallocd_res; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return (1); /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return (1); } /* Get mountd's address on this transport */ memset(&hints, 0, sizeof hints); hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; /* * Bind to specific IPs if asked to */ nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; sock_fd = realloc(sock_fd, (sock_fdcnt + 1) * sizeof(int)); if (sock_fd == NULL) out_of_mem(); sock_fd[sock_fdcnt++] = -1; /* Set invalid for now. */ mallocd_res = 0; hints.ai_flags = AI_PASSIVE; /* * XXX - using RPC library internal functions. */ if ((fd = __rpc_nconf2fd(nconf)) < 0) { int non_fatal = 0; if (errno == EAFNOSUPPORT && nconf->nc_semantics != NC_TPI_CLTS) non_fatal = 1; syslog(non_fatal ? LOG_DEBUG : LOG_ERR, "cannot create socket for %s", nconf->nc_netid); if (non_fatal != 0) continue; exit(1); } switch (hints.ai_family) { case AF_INET: if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET6 address. */ if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } break; case AF_INET6: if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET address. */ if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } /* * We're doing host-based access checks here, so don't * allow v4-in-v6 to confuse things. The kernel will * disable it by default on NFS sockets too. */ if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof one) < 0) { syslog(LOG_ERR, "can't disable v4-in-v6 on IPv6 socket"); exit(1); } break; default: break; } /* * If no hosts were specified, just bind to INADDR_ANY */ if (strcmp("*", hosts[nhostsbak]) == 0) { if (svcport_str == NULL) { res = malloc(sizeof(struct addrinfo)); if (res == NULL) out_of_mem(); mallocd_res = 1; res->ai_flags = hints.ai_flags; res->ai_family = hints.ai_family; res->ai_protocol = hints.ai_protocol; switch (res->ai_family) { case AF_INET: sin = malloc(sizeof(struct sockaddr_in)); if (sin == NULL) out_of_mem(); sin->sin_family = AF_INET; sin->sin_port = htons(0); sin->sin_addr.s_addr = htonl(INADDR_ANY); res->ai_addr = (struct sockaddr*) sin; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in); break; case AF_INET6: sin6 = malloc(sizeof(struct sockaddr_in6)); if (sin6 == NULL) out_of_mem(); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(0); sin6->sin6_addr = in6addr_any; res->ai_addr = (struct sockaddr*) sin6; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in6); break; default: syslog(LOG_ERR, "bad addr fam %d", res->ai_family); exit(1); } } else { if ((aicode = getaddrinfo(NULL, svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } } else { if ((aicode = getaddrinfo(hosts[nhostsbak], svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } /* Store the fd. */ sock_fd[sock_fdcnt - 1] = fd; /* Now, attempt the bind. */ r = bindresvport_sa(fd, res->ai_addr); if (r != 0) { if (errno == EADDRINUSE && mallocd_svcport != 0) { if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); return (-1); } syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } if (svcport_str == NULL) { svcport_str = malloc(NI_MAXSERV * sizeof(char)); if (svcport_str == NULL) out_of_mem(); mallocd_svcport = 1; if (getnameinfo(res->ai_addr, res->ai_addr->sa_len, NULL, NI_MAXHOST, svcport_str, NI_MAXSERV * sizeof(char), NI_NUMERICHOST | NI_NUMERICSERV)) errx(1, "Cannot get port number"); } if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); res = NULL; } return (0); } /* * Called after all the create_service() calls have succeeded, to complete * the setup and registration. */ static void complete_service(struct netconfig *nconf, char *port_str) { struct addrinfo hints, *res = NULL; struct __rpc_sockinfo si; struct netbuf servaddr; SVCXPRT *transp = NULL; int aicode, fd, nhostsbak; int registered = 0; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return; /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return; } nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; if (sock_fdpos >= sock_fdcnt) { /* Should never happen. */ syslog(LOG_ERR, "Ran out of socket fd's"); return; } fd = sock_fd[sock_fdpos++]; if (fd < 0) continue; /* * Using -1 tells listen(2) to use * kern.ipc.soacceptqueue for the backlog. */ if (nconf->nc_semantics != NC_TPI_CLTS) listen(fd, -1); if (nconf->nc_semantics == NC_TPI_CLTS ) transp = svc_dg_create(fd, 0, 0); else transp = svc_vc_create(fd, RPC_MAXDATASIZE, RPC_MAXDATASIZE); if (transp != (SVCXPRT *) NULL) { if (!svc_reg(transp, MOUNTPROG, MOUNTVERS, mntsrv, NULL)) syslog(LOG_ERR, "can't register %s MOUNTVERS service", nconf->nc_netid); if (!force_v2) { if (!svc_reg(transp, MOUNTPROG, MOUNTVERS3, mntsrv, NULL)) syslog(LOG_ERR, "can't register %s MOUNTVERS3 service", nconf->nc_netid); } } else syslog(LOG_WARNING, "can't create %s services", nconf->nc_netid); if (registered == 0) { registered = 1; memset(&hints, 0, sizeof hints); hints.ai_flags = AI_PASSIVE; hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; if ((aicode = getaddrinfo(NULL, port_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address: %s", gai_strerror(aicode)); exit(1); } servaddr.buf = malloc(res->ai_addrlen); memcpy(servaddr.buf, res->ai_addr, res->ai_addrlen); servaddr.len = res->ai_addrlen; rpcb_set(MOUNTPROG, MOUNTVERS, nconf, &servaddr); rpcb_set(MOUNTPROG, MOUNTVERS3, nconf, &servaddr); xcreated++; freeaddrinfo(res); } } /* end while */ } /* * Clear out sockets after a failure to bind one of them, so that the * cycle of socket creation/binding can start anew. */ static void clearout_service(void) { int i; for (i = 0; i < sock_fdcnt; i++) { if (sock_fd[i] >= 0) { shutdown(sock_fd[i], SHUT_RDWR); close(sock_fd[i]); } } } static void usage(void) { fprintf(stderr, "usage: mountd [-2] [-d] [-e] [-l] [-n] [-p ] [-r] " "[-S] [-h ] [export_file ...]\n"); exit(1); } /* * The mount rpc service */ void mntsrv(struct svc_req *rqstp, SVCXPRT *transp) { struct exportlist *ep; struct dirlist *dp; struct fhreturn fhr; struct stat stb; struct statfs fsb; char host[NI_MAXHOST], numerichost[NI_MAXHOST]; int lookup_failed = 1; struct sockaddr *saddr; u_short sport; char rpcpath[MNTPATHLEN + 1], dirpath[MAXPATHLEN]; int bad = 0, defset, hostset; sigset_t sighup_mask; int numsecflavors, *secflavorsp; sigemptyset(&sighup_mask); sigaddset(&sighup_mask, SIGHUP); saddr = svc_getrpccaller(transp)->buf; switch (saddr->sa_family) { case AF_INET6: sport = ntohs(((struct sockaddr_in6 *)saddr)->sin6_port); break; case AF_INET: sport = ntohs(((struct sockaddr_in *)saddr)->sin_port); break; default: syslog(LOG_ERR, "request from unknown address family"); return; } switch (rqstp->rq_proc) { case MOUNTPROC_MNT: case MOUNTPROC_UMNT: case MOUNTPROC_UMNTALL: lookup_failed = getnameinfo(saddr, saddr->sa_len, host, sizeof host, NULL, 0, 0); } getnameinfo(saddr, saddr->sa_len, numerichost, sizeof numerichost, NULL, 0, NI_NUMERICHOST); switch (rqstp->rq_proc) { case NULLPROC: if (!svc_sendreply(transp, (xdrproc_t)xdr_void, NULL)) syslog(LOG_ERR, "can't send reply"); return; case MOUNTPROC_MNT: if (sport >= IPPORT_RESERVED && resvport_only) { syslog(LOG_NOTICE, "mount request from %s from unprivileged port", numerichost); svcerr_weakauth(transp); return; } if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) { syslog(LOG_NOTICE, "undecodable mount request from %s", numerichost); svcerr_decode(transp); return; } /* * Get the real pathname and make sure it is a directory * or a regular file if the -r option was specified * and it exists. */ if (realpath(rpcpath, dirpath) == NULL || stat(dirpath, &stb) < 0 || statfs(dirpath, &fsb) < 0) { chdir("/"); /* Just in case realpath doesn't */ syslog(LOG_NOTICE, "mount request from %s for non existent path %s", numerichost, dirpath); if (debug) warnx("stat failed on %s", dirpath); bad = ENOENT; /* We will send error reply later */ } if (!bad && !S_ISDIR(stb.st_mode) && (dir_only || !S_ISREG(stb.st_mode))) { syslog(LOG_NOTICE, "mount request from %s for non-directory path %s", numerichost, dirpath); if (debug) warnx("mounting non-directory %s", dirpath); bad = ENOTDIR; /* We will send error reply later */ } /* Check in the exports list */ sigprocmask(SIG_BLOCK, &sighup_mask, NULL); if (bad) ep = NULL; else ep = ex_search(&fsb.f_fsid, &exphead); hostset = defset = 0; if (ep && (chk_host(ep->ex_defdir, saddr, &defset, &hostset, &numsecflavors, &secflavorsp) || ((dp = dirp_search(ep->ex_dirl, dirpath)) && chk_host(dp, saddr, &defset, &hostset, &numsecflavors, &secflavorsp)) || (defset && scan_tree(ep->ex_defdir, saddr) == 0 && scan_tree(ep->ex_dirl, saddr) == 0))) { if (bad) { if (!svc_sendreply(transp, (xdrproc_t)xdr_long, (caddr_t)&bad)) syslog(LOG_ERR, "can't send reply"); sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return; } if (hostset & DP_HOSTSET) { fhr.fhr_flag = hostset; fhr.fhr_numsecflavors = numsecflavors; fhr.fhr_secflavors = secflavorsp; } else { fhr.fhr_flag = defset; fhr.fhr_numsecflavors = ep->ex_defnumsecflavors; fhr.fhr_secflavors = ep->ex_defsecflavors; } fhr.fhr_vers = rqstp->rq_vers; /* Get the file handle */ memset(&fhr.fhr_fh, 0, sizeof(nfsfh_t)); if (getfh(dirpath, (fhandle_t *)&fhr.fhr_fh) < 0) { bad = errno; syslog(LOG_ERR, "can't get fh for %s", dirpath); if (!svc_sendreply(transp, (xdrproc_t)xdr_long, (caddr_t)&bad)) syslog(LOG_ERR, "can't send reply"); sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return; } if (!svc_sendreply(transp, (xdrproc_t)xdr_fhs, (caddr_t)&fhr)) syslog(LOG_ERR, "can't send reply"); if (!lookup_failed) add_mlist(host, dirpath); else add_mlist(numerichost, dirpath); if (debug) warnx("mount successful"); if (dolog) syslog(LOG_NOTICE, "mount request succeeded from %s for %s", numerichost, dirpath); } else { if (!bad) bad = EACCES; syslog(LOG_NOTICE, "mount request denied from %s for %s", numerichost, dirpath); } if (bad && !svc_sendreply(transp, (xdrproc_t)xdr_long, (caddr_t)&bad)) syslog(LOG_ERR, "can't send reply"); sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return; case MOUNTPROC_DUMP: if (!svc_sendreply(transp, (xdrproc_t)xdr_mlist, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); else if (dolog) syslog(LOG_NOTICE, "dump request succeeded from %s", numerichost); return; case MOUNTPROC_UMNT: if (sport >= IPPORT_RESERVED && resvport_only) { syslog(LOG_NOTICE, "umount request from %s from unprivileged port", numerichost); svcerr_weakauth(transp); return; } if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) { syslog(LOG_NOTICE, "undecodable umount request from %s", numerichost); svcerr_decode(transp); return; } if (realpath(rpcpath, dirpath) == NULL) { syslog(LOG_NOTICE, "umount request from %s " "for non existent path %s", numerichost, dirpath); } if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); if (!lookup_failed) del_mlist(host, dirpath); del_mlist(numerichost, dirpath); if (dolog) syslog(LOG_NOTICE, "umount request succeeded from %s for %s", numerichost, dirpath); return; case MOUNTPROC_UMNTALL: if (sport >= IPPORT_RESERVED && resvport_only) { syslog(LOG_NOTICE, "umountall request from %s from unprivileged port", numerichost); svcerr_weakauth(transp); return; } if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); if (!lookup_failed) del_mlist(host, NULL); del_mlist(numerichost, NULL); if (dolog) syslog(LOG_NOTICE, "umountall request succeeded from %s", numerichost); return; case MOUNTPROC_EXPORT: if (!svc_sendreply(transp, (xdrproc_t)xdr_explist, (caddr_t)NULL)) if (!svc_sendreply(transp, (xdrproc_t)xdr_explist_brief, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); if (dolog) syslog(LOG_NOTICE, "export request succeeded from %s", numerichost); return; default: svcerr_noproc(transp); return; } } /* * Xdr conversion for a dirpath string */ static int xdr_dir(XDR *xdrsp, char *dirp) { return (xdr_string(xdrsp, &dirp, MNTPATHLEN)); } /* * Xdr routine to generate file handle reply */ static int xdr_fhs(XDR *xdrsp, caddr_t cp) { struct fhreturn *fhrp = (struct fhreturn *)cp; u_long ok = 0, len, auth; int i; if (!xdr_long(xdrsp, &ok)) return (0); switch (fhrp->fhr_vers) { case 1: return (xdr_opaque(xdrsp, (caddr_t)&fhrp->fhr_fh, NFSX_V2FH)); case 3: len = NFSX_V3FH; if (!xdr_long(xdrsp, &len)) return (0); if (!xdr_opaque(xdrsp, (caddr_t)&fhrp->fhr_fh, len)) return (0); if (fhrp->fhr_numsecflavors) { if (!xdr_int(xdrsp, &fhrp->fhr_numsecflavors)) return (0); for (i = 0; i < fhrp->fhr_numsecflavors; i++) if (!xdr_int(xdrsp, &fhrp->fhr_secflavors[i])) return (0); return (1); } else { auth = AUTH_SYS; len = 1; if (!xdr_long(xdrsp, &len)) return (0); return (xdr_long(xdrsp, &auth)); } } return (0); } static int xdr_mlist(XDR *xdrsp, caddr_t cp __unused) { struct mountlist *mlp; int true = 1; int false = 0; char *strp; SLIST_FOREACH(mlp, &mlhead, next) { if (!xdr_bool(xdrsp, &true)) return (0); strp = &mlp->ml_host[0]; if (!xdr_string(xdrsp, &strp, MNTNAMLEN)) return (0); strp = &mlp->ml_dirp[0]; if (!xdr_string(xdrsp, &strp, MNTPATHLEN)) return (0); } if (!xdr_bool(xdrsp, &false)) return (0); return (1); } /* * Xdr conversion for export list */ static int xdr_explist_common(XDR *xdrsp, caddr_t cp __unused, int brief) { struct exportlist *ep; int false = 0; int putdef; sigset_t sighup_mask; sigemptyset(&sighup_mask); sigaddset(&sighup_mask, SIGHUP); sigprocmask(SIG_BLOCK, &sighup_mask, NULL); SLIST_FOREACH(ep, &exphead, entries) { putdef = 0; if (put_exlist(ep->ex_dirl, xdrsp, ep->ex_defdir, &putdef, brief)) goto errout; if (ep->ex_defdir && putdef == 0 && put_exlist(ep->ex_defdir, xdrsp, (struct dirlist *)NULL, &putdef, brief)) goto errout; } sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); if (!xdr_bool(xdrsp, &false)) return (0); return (1); errout: sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return (0); } /* * Called from xdr_explist() to traverse the tree and export the * directory paths. */ static int put_exlist(struct dirlist *dp, XDR *xdrsp, struct dirlist *adp, int *putdefp, int brief) { struct grouplist *grp; struct hostlist *hp; int true = 1; int false = 0; int gotalldir = 0; char *strp; if (dp) { if (put_exlist(dp->dp_left, xdrsp, adp, putdefp, brief)) return (1); if (!xdr_bool(xdrsp, &true)) return (1); strp = dp->dp_dirp; if (!xdr_string(xdrsp, &strp, MNTPATHLEN)) return (1); if (adp && !strcmp(dp->dp_dirp, adp->dp_dirp)) { gotalldir = 1; *putdefp = 1; } if (brief) { if (!xdr_bool(xdrsp, &true)) return (1); strp = "(...)"; if (!xdr_string(xdrsp, &strp, MNTPATHLEN)) return (1); } else if ((dp->dp_flag & DP_DEFSET) == 0 && (gotalldir == 0 || (adp->dp_flag & DP_DEFSET) == 0)) { hp = dp->dp_hosts; while (hp) { grp = hp->ht_grp; if (grp->gr_type == GT_HOST) { if (!xdr_bool(xdrsp, &true)) return (1); strp = grp->gr_ptr.gt_addrinfo->ai_canonname; if (!xdr_string(xdrsp, &strp, MNTNAMLEN)) return (1); } else if (grp->gr_type == GT_NET) { if (!xdr_bool(xdrsp, &true)) return (1); strp = grp->gr_ptr.gt_net.nt_name; if (!xdr_string(xdrsp, &strp, MNTNAMLEN)) return (1); } hp = hp->ht_next; if (gotalldir && hp == (struct hostlist *)NULL) { hp = adp->dp_hosts; gotalldir = 0; } } } if (!xdr_bool(xdrsp, &false)) return (1); if (put_exlist(dp->dp_right, xdrsp, adp, putdefp, brief)) return (1); } return (0); } static int xdr_explist(XDR *xdrsp, caddr_t cp) { return xdr_explist_common(xdrsp, cp, 0); } static int xdr_explist_brief(XDR *xdrsp, caddr_t cp) { return xdr_explist_common(xdrsp, cp, 1); } static char *line; static size_t linesize; static FILE *exp_file; /* * Get the export list from one, currently open file */ static void get_exportlist_one(void) { struct exportlist *ep; struct grouplist *grp, *tgrp; struct dirlist *dirhead; struct statfs fsb; struct xucred anon; char *cp, *endcp, *dirp, *hst, *usr, *dom, savedc; int len, has_host, exflags, got_nondir, dirplen, netgrp; v4root_phase = 0; dirhead = (struct dirlist *)NULL; while (get_line()) { if (debug) warnx("got line %s", line); cp = line; nextfield(&cp, &endcp); if (*cp == '#') goto nextline; /* * Set defaults. */ has_host = FALSE; anon = def_anon; exflags = MNT_EXPORTED; got_nondir = 0; opt_flags = 0; ep = (struct exportlist *)NULL; dirp = NULL; /* * Handle the V4 root dir. */ if (*cp == 'V' && *(cp + 1) == '4' && *(cp + 2) == ':') { /* * V4: just indicates that it is the v4 root point, * so skip over that and set v4root_phase. */ if (v4root_phase > 0) { syslog(LOG_ERR, "V4:duplicate line, ignored"); goto nextline; } v4root_phase = 1; cp += 3; nextfield(&cp, &endcp); } /* * Create new exports list entry */ len = endcp-cp; tgrp = grp = get_grp(); while (len > 0) { if (len > MNTNAMLEN) { getexp_err(ep, tgrp, "mountpoint too long"); goto nextline; } if (*cp == '-') { if (ep == (struct exportlist *)NULL) { getexp_err(ep, tgrp, "flag before export path definition"); goto nextline; } if (debug) warnx("doing opt %s", cp); got_nondir = 1; if (do_opt(&cp, &endcp, ep, grp, &has_host, &exflags, &anon)) { getexp_err(ep, tgrp, NULL); goto nextline; } } else if (*cp == '/') { savedc = *endcp; *endcp = '\0'; if (v4root_phase > 1) { if (dirp != NULL) { getexp_err(ep, tgrp, "Multiple V4 dirs"); goto nextline; } } if (check_dirpath(cp) && statfs(cp, &fsb) >= 0) { if ((fsb.f_flags & MNT_AUTOMOUNTED) != 0) syslog(LOG_ERR, "Warning: exporting of " "automounted fs %s not supported", cp); if (got_nondir) { getexp_err(ep, tgrp, "dirs must be first"); goto nextline; } if (v4root_phase == 1) { if (dirp != NULL) { getexp_err(ep, tgrp, "Multiple V4 dirs"); goto nextline; } if (strlen(v4root_dirpath) == 0) { strlcpy(v4root_dirpath, cp, sizeof (v4root_dirpath)); } else if (strcmp(v4root_dirpath, cp) != 0) { syslog(LOG_ERR, "different V4 dirpath %s", cp); getexp_err(ep, tgrp, NULL); goto nextline; } dirp = cp; v4root_phase = 2; got_nondir = 1; ep = get_exp(); } else { if (ep) { if (ep->ex_fs.val[0] != fsb.f_fsid.val[0] || ep->ex_fs.val[1] != fsb.f_fsid.val[1]) { getexp_err(ep, tgrp, "fsid mismatch"); goto nextline; } } else { /* * See if this directory is already * in the list. */ ep = ex_search(&fsb.f_fsid, &exphead); if (ep == (struct exportlist *)NULL) { ep = get_exp(); ep->ex_fs = fsb.f_fsid; ep->ex_fsdir = strdup(fsb.f_mntonname); if (ep->ex_fsdir == NULL) out_of_mem(); if (debug) warnx( "making new ep fs=0x%x,0x%x", fsb.f_fsid.val[0], fsb.f_fsid.val[1]); } else if (debug) warnx("found ep fs=0x%x,0x%x", fsb.f_fsid.val[0], fsb.f_fsid.val[1]); } /* * Add dirpath to export mount point. */ dirp = add_expdir(&dirhead, cp, len); dirplen = len; } } else { getexp_err(ep, tgrp, "symbolic link in export path or statfs failed"); goto nextline; } *endcp = savedc; } else { savedc = *endcp; *endcp = '\0'; got_nondir = 1; if (ep == (struct exportlist *)NULL) { getexp_err(ep, tgrp, "host(s) before export path definition"); goto nextline; } /* * Get the host or netgroup. */ setnetgrent(cp); netgrp = getnetgrent(&hst, &usr, &dom); do { if (has_host) { grp->gr_next = get_grp(); grp = grp->gr_next; } if (netgrp) { if (hst == 0) { syslog(LOG_ERR, "null hostname in netgroup %s, skipping", cp); grp->gr_type = GT_IGNORE; } else if (get_host(hst, grp, tgrp)) { syslog(LOG_ERR, "bad host %s in netgroup %s, skipping", hst, cp); grp->gr_type = GT_IGNORE; } } else if (get_host(cp, grp, tgrp)) { syslog(LOG_ERR, "bad host %s, skipping", cp); grp->gr_type = GT_IGNORE; } has_host = TRUE; } while (netgrp && getnetgrent(&hst, &usr, &dom)); endnetgrent(); *endcp = savedc; } cp = endcp; nextfield(&cp, &endcp); len = endcp - cp; } if (check_options(dirhead)) { getexp_err(ep, tgrp, NULL); goto nextline; } if (!has_host) { grp->gr_type = GT_DEFAULT; if (debug) warnx("adding a default entry"); /* * Don't allow a network export coincide with a list of * host(s) on the same line. */ } else if ((opt_flags & OP_NET) && tgrp->gr_next) { getexp_err(ep, tgrp, "network/host conflict"); goto nextline; /* * If an export list was specified on this line, make sure * that we have at least one valid entry, otherwise skip it. */ } else { grp = tgrp; while (grp && grp->gr_type == GT_IGNORE) grp = grp->gr_next; if (! grp) { getexp_err(ep, tgrp, "no valid entries"); goto nextline; } } if (v4root_phase == 1) { getexp_err(ep, tgrp, "V4:root, no dirp, ignored"); goto nextline; } /* * Loop through hosts, pushing the exports into the kernel. * After loop, tgrp points to the start of the list and * grp points to the last entry in the list. */ grp = tgrp; do { if (do_mount(ep, grp, exflags, &anon, dirp, dirplen, &fsb)) { getexp_err(ep, tgrp, NULL); goto nextline; } } while (grp->gr_next && (grp = grp->gr_next)); /* * For V4: don't enter in mount lists. */ if (v4root_phase > 0 && v4root_phase <= 2) { /* * Since these structures aren't used by mountd, * free them up now. */ if (ep != NULL) free_exp(ep); while (tgrp != NULL) { grp = tgrp; tgrp = tgrp->gr_next; free_grp(grp); } goto nextline; } /* * Success. Update the data structures. */ if (has_host) { hang_dirp(dirhead, tgrp, ep, opt_flags); - grp->gr_next = grphead; - grphead = tgrp; + grp->gr_next = ep->ex_grphead; + ep->ex_grphead = tgrp; } else { hang_dirp(dirhead, (struct grouplist *)NULL, ep, opt_flags); free_grp(grp); } dirhead = (struct dirlist *)NULL; if ((ep->ex_flag & EX_LINKED) == 0) { insert_exports(ep, &exphead); ep->ex_flag |= EX_LINKED; } nextline: v4root_phase = 0; if (dirhead) { free_dir(dirhead); dirhead = (struct dirlist *)NULL; } } } /* * Get the export list from all specified files */ static void get_exportlist(void) { - struct grouplist *grp, *tgrp; struct export_args export; struct iovec *iov; struct statfs *mntbufp; char errmsg[255]; int num, i; int iovlen; struct nfsex_args eargs; if (suspend_nfsd != 0) (void)nfssvc(NFSSVC_SUSPENDNFSD, NULL); v4root_dirpath[0] = '\0'; bzero(&export, sizeof(export)); export.ex_flags = MNT_DELEXPORT; iov = NULL; iovlen = 0; bzero(errmsg, sizeof(errmsg)); /* * First, get rid of the old list */ free_exports(&exphead); - grp = grphead; - while (grp) { - tgrp = grp; - grp = grp->gr_next; - free_grp(tgrp); - } - grphead = (struct grouplist *)NULL; - /* * and the old V4 root dir. */ bzero(&eargs, sizeof (eargs)); eargs.export.ex_flags = MNT_DELEXPORT; if (nfssvc(NFSSVC_V4ROOTEXPORT, (caddr_t)&eargs) < 0 && errno != ENOENT) syslog(LOG_ERR, "Can't delete exports for V4:"); /* * and clear flag that notes if a public fh has been exported. */ has_publicfh = 0; /* * And delete exports that are in the kernel for all local * filesystems. * XXX: Should know how to handle all local exportable filesystems. */ num = getmntinfo(&mntbufp, MNT_NOWAIT); if (num > 0) { build_iovec(&iov, &iovlen, "fstype", NULL, 0); build_iovec(&iov, &iovlen, "fspath", NULL, 0); build_iovec(&iov, &iovlen, "from", NULL, 0); build_iovec(&iov, &iovlen, "update", NULL, 0); build_iovec(&iov, &iovlen, "export", &export, sizeof(export)); build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); } for (i = 0; i < num; i++) delete_export(iov, iovlen, &mntbufp[i], errmsg); if (iov != NULL) { /* Free strings allocated by strdup() in getmntopts.c */ free(iov[0].iov_base); /* fstype */ free(iov[2].iov_base); /* fspath */ free(iov[4].iov_base); /* from */ free(iov[6].iov_base); /* update */ free(iov[8].iov_base); /* export */ free(iov[10].iov_base); /* errmsg */ /* free iov, allocated by realloc() */ free(iov); iovlen = 0; } read_exportfile(); /* * If there was no public fh, clear any previous one set. */ if (has_publicfh == 0) (void) nfssvc(NFSSVC_NOPUBLICFH, NULL); /* Resume the nfsd. If they weren't suspended, this is harmless. */ (void)nfssvc(NFSSVC_RESUMENFSD, NULL); } /* * Insert an export entry in the appropriate list. */ static void insert_exports(struct exportlist *ep, struct exportlisthead *exhp) { SLIST_INSERT_HEAD(exhp, ep, entries); } /* * Free up the exports lists passed in as arguments. */ static void free_exports(struct exportlisthead *exhp) { struct exportlist *ep, *ep2; SLIST_FOREACH_SAFE(ep, exhp, entries, ep2) { SLIST_REMOVE(exhp, ep, exportlist, entries); free_exp(ep); } SLIST_INIT(exhp); } /* * Read the exports file(s) and call get_exportlist_one() for each line. */ static void read_exportfile(void) { int done, i; /* * Read in the exports file and build the list, calling * nmount() as we go along to push the export rules into the kernel. */ done = 0; for (i = 0; exnames[i] != NULL; i++) { if (debug) warnx("reading exports from %s", exnames[i]); if ((exp_file = fopen(exnames[i], "r")) == NULL) { syslog(LOG_WARNING, "can't open %s", exnames[i]); continue; } get_exportlist_one(); fclose(exp_file); done++; } if (done == 0) { syslog(LOG_ERR, "can't open any exports file"); exit(2); } } /* * Delete an exports entry. */ static void delete_export(struct iovec *iov, int iovlen, struct statfs *fsp, char *errmsg) { struct xvfsconf vfc; if (getvfsbyname(fsp->f_fstypename, &vfc) != 0) { syslog(LOG_ERR, "getvfsbyname() failed for %s", fsp->f_fstypename); return; } /* * We do not need to delete "export" flag from * filesystems that do not have it set. */ if (!(fsp->f_flags & MNT_EXPORTED)) return; /* * Do not delete export for network filesystem by * passing "export" arg to nmount(). * It only makes sense to do this for local filesystems. */ if (vfc.vfc_flags & VFCF_NETWORK) return; iov[1].iov_base = fsp->f_fstypename; iov[1].iov_len = strlen(fsp->f_fstypename) + 1; iov[3].iov_base = fsp->f_mntonname; iov[3].iov_len = strlen(fsp->f_mntonname) + 1; iov[5].iov_base = fsp->f_mntfromname; iov[5].iov_len = strlen(fsp->f_mntfromname) + 1; errmsg[0] = '\0'; /* * EXDEV is returned when path exists but is not a * mount point. May happens if raced with unmount. */ if (nmount(iov, iovlen, fsp->f_flags) < 0 && errno != ENOENT && errno != ENOTSUP && errno != EXDEV) { syslog(LOG_ERR, "can't delete exports for %s: %m %s", fsp->f_mntonname, errmsg); } } /* * Allocate an export list element */ static struct exportlist * get_exp(void) { struct exportlist *ep; ep = (struct exportlist *)calloc(1, sizeof (struct exportlist)); if (ep == (struct exportlist *)NULL) out_of_mem(); return (ep); } /* * Allocate a group list element */ static struct grouplist * get_grp(void) { struct grouplist *gp; gp = (struct grouplist *)calloc(1, sizeof (struct grouplist)); if (gp == (struct grouplist *)NULL) out_of_mem(); return (gp); } /* * Clean up upon an error in get_exportlist(). */ static void getexp_err(struct exportlist *ep, struct grouplist *grp, const char *reason) { struct grouplist *tgrp; if (!(opt_flags & OP_QUIET)) { if (reason != NULL) syslog(LOG_ERR, "bad exports list line '%s': %s", line, reason); else syslog(LOG_ERR, "bad exports list line '%s'", line); } if (ep && (ep->ex_flag & EX_LINKED) == 0) free_exp(ep); while (grp) { tgrp = grp; grp = grp->gr_next; free_grp(tgrp); } } /* * Search the export list for a matching fs. */ static struct exportlist * ex_search(fsid_t *fsid, struct exportlisthead *exhp) { struct exportlist *ep; SLIST_FOREACH(ep, exhp, entries) { if (ep->ex_fs.val[0] == fsid->val[0] && ep->ex_fs.val[1] == fsid->val[1]) return (ep); } return (ep); } /* * Add a directory path to the list. */ static char * add_expdir(struct dirlist **dpp, char *cp, int len) { struct dirlist *dp; dp = malloc(sizeof (struct dirlist)); if (dp == (struct dirlist *)NULL) out_of_mem(); dp->dp_left = *dpp; dp->dp_right = (struct dirlist *)NULL; dp->dp_flag = 0; dp->dp_hosts = (struct hostlist *)NULL; dp->dp_dirp = strndup(cp, len); if (dp->dp_dirp == NULL) out_of_mem(); *dpp = dp; return (dp->dp_dirp); } /* * Hang the dir list element off the dirpath binary tree as required * and update the entry for host. */ static void hang_dirp(struct dirlist *dp, struct grouplist *grp, struct exportlist *ep, int flags) { struct hostlist *hp; struct dirlist *dp2; if (flags & OP_ALLDIRS) { if (ep->ex_defdir) free((caddr_t)dp); else ep->ex_defdir = dp; if (grp == (struct grouplist *)NULL) { ep->ex_defdir->dp_flag |= DP_DEFSET; /* Save the default security flavors list. */ ep->ex_defnumsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(ep->ex_defsecflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); } else while (grp) { hp = get_ht(); hp->ht_grp = grp; hp->ht_next = ep->ex_defdir->dp_hosts; ep->ex_defdir->dp_hosts = hp; /* Save the security flavors list for this host set. */ grp->gr_numsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(grp->gr_secflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); grp = grp->gr_next; } } else { /* * Loop through the directories adding them to the tree. */ while (dp) { dp2 = dp->dp_left; add_dlist(&ep->ex_dirl, dp, grp, flags, ep); dp = dp2; } } } /* * Traverse the binary tree either updating a node that is already there * for the new directory or adding the new node. */ static void add_dlist(struct dirlist **dpp, struct dirlist *newdp, struct grouplist *grp, int flags, struct exportlist *ep) { struct dirlist *dp; struct hostlist *hp; int cmp; dp = *dpp; if (dp) { cmp = strcmp(dp->dp_dirp, newdp->dp_dirp); if (cmp > 0) { add_dlist(&dp->dp_left, newdp, grp, flags, ep); return; } else if (cmp < 0) { add_dlist(&dp->dp_right, newdp, grp, flags, ep); return; } else free((caddr_t)newdp); } else { dp = newdp; dp->dp_left = (struct dirlist *)NULL; *dpp = dp; } if (grp) { /* * Hang all of the host(s) off of the directory point. */ do { hp = get_ht(); hp->ht_grp = grp; hp->ht_next = dp->dp_hosts; dp->dp_hosts = hp; /* Save the security flavors list for this host set. */ grp->gr_numsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(grp->gr_secflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); grp = grp->gr_next; } while (grp); } else { dp->dp_flag |= DP_DEFSET; /* Save the default security flavors list. */ ep->ex_defnumsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(ep->ex_defsecflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); } } /* * Search for a dirpath on the export point. */ static struct dirlist * dirp_search(struct dirlist *dp, char *dirp) { int cmp; if (dp) { cmp = strcmp(dp->dp_dirp, dirp); if (cmp > 0) return (dirp_search(dp->dp_left, dirp)); else if (cmp < 0) return (dirp_search(dp->dp_right, dirp)); else return (dp); } return (dp); } /* * Scan for a host match in a directory tree. */ static int chk_host(struct dirlist *dp, struct sockaddr *saddr, int *defsetp, int *hostsetp, int *numsecflavors, int **secflavorsp) { struct hostlist *hp; struct grouplist *grp; struct addrinfo *ai; if (dp) { if (dp->dp_flag & DP_DEFSET) *defsetp = dp->dp_flag; hp = dp->dp_hosts; while (hp) { grp = hp->ht_grp; switch (grp->gr_type) { case GT_HOST: ai = grp->gr_ptr.gt_addrinfo; for (; ai; ai = ai->ai_next) { if (!sacmp(ai->ai_addr, saddr, NULL)) { *hostsetp = (hp->ht_flag | DP_HOSTSET); if (numsecflavors != NULL) { *numsecflavors = grp->gr_numsecflavors; *secflavorsp = grp->gr_secflavors; } return (1); } } break; case GT_NET: if (!sacmp(saddr, (struct sockaddr *) &grp->gr_ptr.gt_net.nt_net, (struct sockaddr *) &grp->gr_ptr.gt_net.nt_mask)) { *hostsetp = (hp->ht_flag | DP_HOSTSET); if (numsecflavors != NULL) { *numsecflavors = grp->gr_numsecflavors; *secflavorsp = grp->gr_secflavors; } return (1); } break; } hp = hp->ht_next; } } return (0); } /* * Scan tree for a host that matches the address. */ static int scan_tree(struct dirlist *dp, struct sockaddr *saddr) { int defset, hostset; if (dp) { if (scan_tree(dp->dp_left, saddr)) return (1); if (chk_host(dp, saddr, &defset, &hostset, NULL, NULL)) return (1); if (scan_tree(dp->dp_right, saddr)) return (1); } return (0); } /* * Traverse the dirlist tree and free it up. */ static void free_dir(struct dirlist *dp) { if (dp) { free_dir(dp->dp_left); free_dir(dp->dp_right); free_host(dp->dp_hosts); free(dp->dp_dirp); free(dp); } } /* * Parse a colon separated list of security flavors */ static int parsesec(char *seclist, struct exportlist *ep) { char *cp, savedc; int flavor; ep->ex_numsecflavors = 0; for (;;) { cp = strchr(seclist, ':'); if (cp) { savedc = *cp; *cp = '\0'; } if (!strcmp(seclist, "sys")) flavor = AUTH_SYS; else if (!strcmp(seclist, "krb5")) flavor = RPCSEC_GSS_KRB5; else if (!strcmp(seclist, "krb5i")) flavor = RPCSEC_GSS_KRB5I; else if (!strcmp(seclist, "krb5p")) flavor = RPCSEC_GSS_KRB5P; else { if (cp) *cp = savedc; syslog(LOG_ERR, "bad sec flavor: %s", seclist); return (1); } if (ep->ex_numsecflavors == MAXSECFLAVORS) { if (cp) *cp = savedc; syslog(LOG_ERR, "too many sec flavors: %s", seclist); return (1); } ep->ex_secflavors[ep->ex_numsecflavors] = flavor; ep->ex_numsecflavors++; if (cp) { *cp = savedc; seclist = cp + 1; } else { break; } } return (0); } /* * Parse the option string and update fields. * Option arguments may either be -