Index: contrib/llvm/include/llvm/CodeGen/Passes.h =================================================================== --- contrib/llvm/include/llvm/CodeGen/Passes.h +++ contrib/llvm/include/llvm/CodeGen/Passes.h @@ -420,6 +420,9 @@ /// shuffles. FunctionPass *createExpandReductionsPass(); + // This pass expands indirectbr instructions. + FunctionPass *createIndirectBrExpandPass(); + } // End llvm namespace #endif Index: contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ contrib/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -406,6 +406,13 @@ /// immediately before machine code is emitted. virtual void addPreEmitPass() { } + /// Targets may add passes immediately before machine code is emitted in this + /// callback. This is called even later than `addPreEmitPass`. + // FIXME: Rename `addPreEmitPass` to something more sensible given its actual + // position and remove the `2` suffix here as this callback is what + // `addPreEmitPass` *should* be but in reality isn't. + virtual void addPreEmitPass2() {} + /// Utilities for targets to add passes to the pass manager. /// Index: contrib/llvm/include/llvm/InitializePasses.h =================================================================== --- contrib/llvm/include/llvm/InitializePasses.h +++ contrib/llvm/include/llvm/InitializePasses.h @@ -157,6 +157,7 @@ void initializeIfConverterPass(PassRegistry&); void initializeImplicitNullChecksPass(PassRegistry&); void initializeIndVarSimplifyLegacyPassPass(PassRegistry&); +void initializeIndirectBrExpandPassPass(PassRegistry&); void initializeInductiveRangeCheckEliminationPass(PassRegistry&); void initializeInferAddressSpacesPass(PassRegistry&); void initializeInferFunctionAttrsLegacyPassPass(PassRegistry&); Index: contrib/llvm/include/llvm/Target/TargetLowering.h =================================================================== --- contrib/llvm/include/llvm/Target/TargetLowering.h +++ contrib/llvm/include/llvm/Target/TargetLowering.h @@ -799,7 +799,7 @@ } /// Return true if lowering to a jump table is allowed. - bool areJTsAllowed(const Function *Fn) const { + virtual bool areJTsAllowed(const Function *Fn) const { if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true") return false; Index: contrib/llvm/include/llvm/Target/TargetSubtargetInfo.h =================================================================== --- contrib/llvm/include/llvm/Target/TargetSubtargetInfo.h +++ contrib/llvm/include/llvm/Target/TargetSubtargetInfo.h @@ -172,6 +172,9 @@ /// \brief True if the subtarget should run the atomic expansion pass. virtual bool enableAtomicExpand() const; + /// True if the subtarget should run the indirectbr expansion pass. + virtual bool enableIndirectBrExpand() const; + /// \brief Override generic scheduling policy within a region. 
/// /// This is a convenient way for targets that don't provide any custom Index: contrib/llvm/lib/CodeGen/CodeGen.cpp =================================================================== --- contrib/llvm/lib/CodeGen/CodeGen.cpp +++ contrib/llvm/lib/CodeGen/CodeGen.cpp @@ -39,6 +39,7 @@ initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); initializeImplicitNullChecksPass(Registry); + initializeIndirectBrExpandPassPass(Registry); initializeInterleavedAccessPass(Registry); initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); Index: contrib/llvm/lib/CodeGen/IndirectBrExpandPass.cpp =================================================================== --- /dev/null +++ contrib/llvm/lib/CodeGen/IndirectBrExpandPass.cpp @@ -0,0 +1,221 @@ +//===- IndirectBrExpandPass.cpp - Expand indirectbr to switch -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Implements an expansion pass to turn `indirectbr` instructions in the IR +/// into `switch` instructions. This works by enumerating the basic blocks in +/// a dense range of integers, replacing each `blockaddr` constant with the +/// corresponding integer constant, and then building a switch that maps from +/// the integers to the actual blocks. All of the indirectbr instructions in the +/// function are redirected to this common switch. +/// +/// While this is generically useful if a target is unable to codegen +/// `indirectbr` natively, it is primarily useful when there is some desire to +/// get the builtin non-jump-table lowering of a switch even when the input +/// source contained an explicit indirect branch construct. +/// +/// Note that it doesn't make any sense to enable this pass unless a target also +/// disables jump-table lowering of switches. Doing that is likely to pessimize +/// the code. 
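+///
+/// As a rough sketch (all value and block names here are illustrative), a
+/// terminator such as
+///
+///   indirectbr i8* %target, [label %a, label %b]
+///
+/// where the `blockaddress` constants for %a and %b escape, is rewritten so
+/// that those block addresses become the integers 1 and 2 and the branch
+/// becomes approximately
+///
+///   %idx = ptrtoint i8* %target to i64
+///   switch i64 %idx, label %a [ i64 2, label %b ]
+///
+/// with the first enumerated block doubling as the switch default.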
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "indirectbr-expand" + +namespace { + +class IndirectBrExpandPass : public FunctionPass { + const TargetLowering *TLI = nullptr; + +public: + static char ID; // Pass identification, replacement for typeid + + IndirectBrExpandPass() : FunctionPass(ID) { + initializeIndirectBrExpandPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; + +} // end anonymous namespace + +char IndirectBrExpandPass::ID = 0; + +INITIALIZE_PASS(IndirectBrExpandPass, DEBUG_TYPE, + "Expand indirectbr instructions", false, false) + +FunctionPass *llvm::createIndirectBrExpandPass() { + return new IndirectBrExpandPass(); +} + +bool IndirectBrExpandPass::runOnFunction(Function &F) { + auto &DL = F.getParent()->getDataLayout(); + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + auto &TM = TPC->getTM(); + auto &STI = *TM.getSubtargetImpl(F); + if (!STI.enableIndirectBrExpand()) + return false; + TLI = STI.getTargetLowering(); + + SmallVector IndirectBrs; + + // Set of all potential successors for indirectbr instructions. + SmallPtrSet IndirectBrSuccs; + + // Build a list of indirectbrs that we want to rewrite. + for (BasicBlock &BB : F) + if (auto *IBr = dyn_cast(BB.getTerminator())) { + // Handle the degenerate case of no successors by replacing the indirectbr + // with unreachable as there is no successor available. + if (IBr->getNumSuccessors() == 0) { + (void)new UnreachableInst(F.getContext(), IBr); + IBr->eraseFromParent(); + continue; + } + + IndirectBrs.push_back(IBr); + for (BasicBlock *SuccBB : IBr->successors()) + IndirectBrSuccs.insert(SuccBB); + } + + if (IndirectBrs.empty()) + return false; + + // If we need to replace any indirectbrs we need to establish integer + // constants that will correspond to each of the basic blocks in the function + // whose address escapes. We do that here and rewrite all the blockaddress + // constants to just be those integer constants cast to a pointer type. + SmallVector BBs; + + for (BasicBlock &BB : F) { + // Skip blocks that aren't successors to an indirectbr we're going to + // rewrite. + if (!IndirectBrSuccs.count(&BB)) + continue; + + auto IsBlockAddressUse = [&](const Use &U) { + return isa(U.getUser()); + }; + auto BlockAddressUseIt = llvm::find_if(BB.uses(), IsBlockAddressUse); + if (BlockAddressUseIt == BB.use_end()) + continue; + + assert(std::find_if(std::next(BlockAddressUseIt), BB.use_end(), + IsBlockAddressUse) == BB.use_end() && + "There should only ever be a single blockaddress use because it is " + "a constant and should be uniqued."); + + auto *BA = cast(BlockAddressUseIt->getUser()); + + // Skip if the constant was formed but ended up not being used (due to DCE + // or whatever). + if (!BA->isConstantUsed()) + continue; + + // Compute the index we want to use for this basic block. 
We can't use zero + // because null can be compared with block addresses. + int BBIndex = BBs.size() + 1; + BBs.push_back(&BB); + + auto *ITy = cast(DL.getIntPtrType(BA->getType())); + ConstantInt *BBIndexC = ConstantInt::get(ITy, BBIndex); + + // Now rewrite the blockaddress to an integer constant based on the index. + // FIXME: We could potentially preserve the uses as arguments to inline asm. + // This would allow some uses such as diagnostic information in crashes to + // have higher quality even when this transform is enabled, but would break + // users that round-trip blockaddresses through inline assembly and then + // back into an indirectbr. + BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(BBIndexC, BA->getType())); + } + + if (BBs.empty()) { + // There are no blocks whose address is taken, so any indirectbr instruction + // cannot get a valid input and we can replace all of them with unreachable. + for (auto *IBr : IndirectBrs) { + (void)new UnreachableInst(F.getContext(), IBr); + IBr->eraseFromParent(); + } + return true; + } + + BasicBlock *SwitchBB; + Value *SwitchValue; + + // Compute a common integer type across all the indirectbr instructions. + IntegerType *CommonITy = nullptr; + for (auto *IBr : IndirectBrs) { + auto *ITy = + cast(DL.getIntPtrType(IBr->getAddress()->getType())); + if (!CommonITy || ITy->getBitWidth() > CommonITy->getBitWidth()) + CommonITy = ITy; + } + + auto GetSwitchValue = [DL, CommonITy](IndirectBrInst *IBr) { + return CastInst::CreatePointerCast( + IBr->getAddress(), CommonITy, + Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr); + }; + + if (IndirectBrs.size() == 1) { + // If we only have one indirectbr, we can just directly replace it within + // its block. + SwitchBB = IndirectBrs[0]->getParent(); + SwitchValue = GetSwitchValue(IndirectBrs[0]); + IndirectBrs[0]->eraseFromParent(); + } else { + // Otherwise we need to create a new block to hold the switch across BBs, + // jump to that block instead of each indirectbr, and phi together the + // values for the switch. + SwitchBB = BasicBlock::Create(F.getContext(), "switch_bb", &F); + auto *SwitchPN = PHINode::Create(CommonITy, IndirectBrs.size(), + "switch_value_phi", SwitchBB); + SwitchValue = SwitchPN; + + // Now replace the indirectbr instructions with direct branches to the + // switch block and fill out the PHI operands. + for (auto *IBr : IndirectBrs) { + SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent()); + BranchInst::Create(SwitchBB, IBr); + IBr->eraseFromParent(); + } + } + + // Now build the switch in the block. The block will have no terminator + // already. + auto *SI = SwitchInst::Create(SwitchValue, BBs[0], BBs.size(), SwitchBB); + + // Add a case for each block. + for (int i : llvm::seq(1, BBs.size())) + SI->addCase(ConstantInt::get(CommonITy, i + 1), BBs[i]); + + return true; +} Index: contrib/llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- contrib/llvm/lib/CodeGen/TargetPassConfig.cpp +++ contrib/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -790,6 +790,9 @@ if (EnableMachineOutliner) PM->add(createMachineOutlinerPass()); + // Add passes that directly emit MI after all other MI passes. 
+ addPreEmitPass2(); + AddingMachinePasses = false; } Index: contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp =================================================================== --- contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp +++ contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp @@ -37,6 +37,10 @@ return true; } +bool TargetSubtargetInfo::enableIndirectBrExpand() const { + return false; +} + bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } Index: contrib/llvm/lib/Target/X86/X86.h =================================================================== --- contrib/llvm/lib/Target/X86/X86.h +++ contrib/llvm/lib/Target/X86/X86.h @@ -22,6 +22,7 @@ class FunctionPass; class ImmutablePass; class InstructionSelector; +class ModulePass; class PassRegistry; class X86RegisterBankInfo; class X86Subtarget; @@ -97,6 +98,9 @@ /// This pass replaces EVEX ecnoded of AVX-512 instructiosn by VEX /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); + +/// This pass creates the thunks for the retpoline feature. +FunctionPass *createX86RetpolineThunksPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, Index: contrib/llvm/lib/Target/X86/X86.td =================================================================== --- contrib/llvm/lib/Target/X86/X86.td +++ contrib/llvm/lib/Target/X86/X86.td @@ -290,6 +290,27 @@ "ermsb", "HasERMSB", "true", "REP MOVS/STOS are fast">; +// Enable mitigation of some aspects of speculative execution related +// vulnerabilities by removing speculatable indirect branches. This disables +// jump-table formation, rewrites explicit `indirectbr` instructions into +// `switch` instructions, and uses a special construct called a "retpoline" to +// prevent speculation of the remaining indirect branches (indirect calls and +// tail calls). +def FeatureRetpoline + : SubtargetFeature<"retpoline", "UseRetpoline", "true", + "Remove speculation of indirect branches from the " + "generated code, either by avoiding them entirely or " + "lowering them with a speculation blocking construct.">; + +// Rely on external thunks for the emitted retpoline calls. This allows users +// to provide their own custom thunk definitions in highly specialized +// environments such as a kernel that does boot-time hot patching. +def FeatureRetpolineExternalThunk + : SubtargetFeature< + "retpoline-external-thunk", "UseRetpolineExternalThunk", "true", + "Enable retpoline, but with an externally provided thunk.", + [FeatureRetpoline]>; + //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// Index: contrib/llvm/lib/Target/X86/X86AsmPrinter.h =================================================================== --- contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -30,6 +30,7 @@ StackMaps SM; FaultMaps FM; std::unique_ptr CodeEmitter; + bool NeedsRetpoline = false; // This utility class tracks the length of a stackmap instruction's 'shadow'. 
// It is used by the X86AsmPrinter to ensure that the stackmap shadow Index: contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -344,6 +344,8 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, char Mode, raw_ostream &O) { unsigned Reg = MO.getReg(); + bool EmitPercent = true; + switch (Mode) { default: return true; // Unknown mode. case 'b': // Print QImode register @@ -358,6 +360,9 @@ case 'k': // Print SImode register Reg = getX86SubSuperRegister(Reg, 32); break; + case 'V': + EmitPercent = false; + LLVM_FALLTHROUGH; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. @@ -365,7 +370,10 @@ break; } - O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + if (EmitPercent) + O << '%'; + + O << X86ATTInstPrinter::getRegisterName(Reg); return false; } @@ -438,6 +446,7 @@ case 'w': // Print HImode register case 'k': // Print SImode register case 'q': // Print DImode register + case 'V': // Print native register without '%' if (MO.isReg()) return printAsmMRegister(*this, MO, ExtraCode[0], O); printOperand(*this, MI, OpNo, O); Index: contrib/llvm/lib/Target/X86/X86FastISel.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -3161,6 +3161,10 @@ (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers"))) return false; + // Functions using retpoline should use SDISel for calls. + if (Subtarget->useRetpoline()) + return false; + // Handle only C, fastcc, and webkit_js calling conventions for now. switch (CC) { default: return false; Index: contrib/llvm/lib/Target/X86/X86FrameLowering.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -742,6 +742,11 @@ bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + // FIXME: Add retpoline support and remove this. + if (Is64Bit && IsLargeCodeModel && STI.useRetpoline()) + report_fatal_error("Emitting stack probe calls on 64-bit with the large " + "code model and retpoline not yet implemented."); + unsigned CallOp; if (Is64Bit) CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; @@ -2337,6 +2342,10 @@ // This solution is not perfect, as it assumes that the .rodata section // is laid out within 2^31 bytes of each function body, but this seems // to be sufficient for JIT. + // FIXME: Add retpoline support and remove the error here.. + if (STI.useRetpoline()) + report_fatal_error("Emitting morestack calls on 64-bit with the large " + "code model and retpoline not yet implemented."); BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) .addReg(X86::RIP) .addImm(0) Index: contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -550,11 +550,11 @@ SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && - // Only does this when target favors doesn't favor register indirect - // call. + // Only do this when the target can fold the load into the call or + // jmp. 
+ !Subtarget->useRetpoline() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) || (N->getOpcode() == X86ISD::TC_RETURN && - // Only does this if load can be folded into TC_RETURN. (Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just Index: contrib/llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -986,6 +986,9 @@ bool isVectorClearMaskLegal(const SmallVectorImpl &Mask, EVT VT) const override; + /// Returns true if lowering to a jump table is allowed. + bool areJTsAllowed(const Function *Fn) const override; + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. @@ -1288,6 +1291,9 @@ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; Index: contrib/llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24994,6 +24994,15 @@ return isShuffleMaskLegal(Mask, VT); } +bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { + // If the subtarget is using retpolines, we need to not generate jump tables. + if (Subtarget.useRetpoline()) + return false; + + // Otherwise, fallback on the generic logic. + return TargetLowering::areJTsAllowed(Fn); +} + //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// @@ -26225,7 +26234,131 @@ return BB; } +static unsigned getOpcodeForRetpoline(unsigned RPOpc) { + switch (RPOpc) { + case X86::RETPOLINE_CALL32: + return X86::CALLpcrel32; + case X86::RETPOLINE_CALL64: + return X86::CALL64pcrel32; + case X86::RETPOLINE_TCRETURN32: + return X86::TCRETURNdi; + case X86::RETPOLINE_TCRETURN64: + return X86::TCRETURNdi64; + } + llvm_unreachable("not retpoline opcode"); +} + +static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, + unsigned Reg) { + if (Subtarget.useRetpolineExternalThunk()) { + // When using an external thunk for retpolines, we pick names that match the + // names GCC happens to use as well. This helps simplify the implementation + // of the thunks for kernels where they have no easy ability to create + // aliases and are doing non-trivial configuration of the thunk's body. For + // example, the Linux kernel will do boot-time hot patching of the thunk + // bodies and cannot easily export aliases of these to loaded modules. + // + // Note that at any point in the future, we may need to change the semantics + // of how we implement retpolines and at that time will likely change the + // name of the called thunk. Essentially, there is no hard guarantee that + // LLVM will generate calls to specific thunks, we merely make a best-effort + // attempt to help out kernels and other systems where duplicating the + // thunks is costly. 
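+    // Only the registers that EmitLoweredRetpoline below can actually pick
+    // are handled here: R11 for 64-bit targets and EAX/ECX/EDX/EDI for
+    // 32-bit targets.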
+ switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__x86_indirect_thunk_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); + } + + // When targeting an internal COMDAT thunk use an LLVM-specific name. + switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); +} + MachineBasicBlock * +X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, + MachineBasicBlock *BB) const { + // Copy the virtual register into the R11 physical register and + // call the retpoline thunk. + DebugLoc DL = MI.getDebugLoc(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + unsigned CalleeVReg = MI.getOperand(0).getReg(); + unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); + + // Find an available scratch register to hold the callee. On 64-bit, we can + // just use R11, but we scan for uses anyway to ensure we don't generate + // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't + // already a register use operand to the call to hold the callee. If none + // are available, use EDI instead. EDI is chosen because EBX is the PIC base + // register and ESI is the base pointer to realigned stack frames with VLAs. + SmallVector AvailableRegs; + if (Subtarget.is64Bit()) + AvailableRegs.push_back(X86::R11); + else + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); + + // Zero out any registers that are already used. + for (const auto &MO : MI.operands()) { + if (MO.isReg() && MO.isUse()) + for (unsigned &Reg : AvailableRegs) + if (Reg == MO.getReg()) + Reg = 0; + } + + // Choose the first remaining non-zero available register. 
+ unsigned AvailableReg = 0; + for (unsigned MaybeReg : AvailableRegs) { + if (MaybeReg) { + AvailableReg = MaybeReg; + break; + } + } + if (!AvailableReg) + report_fatal_error("calling convention incompatible with retpoline, no " + "available registers"); + + const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); + + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); + return BB; +} + +MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); @@ -26689,6 +26822,11 @@ case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); + case X86::RETPOLINE_CALL32: + case X86::RETPOLINE_CALL64: + case X86::RETPOLINE_TCRETURN32: + case X86::RETPOLINE_TCRETURN64: + return EmitLoweredRetpoline(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: Index: contrib/llvm/lib/Target/X86/X86InstrCompiler.td =================================================================== --- contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1106,14 +1106,14 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + Requires<[Not64BitMode, NotUseRetpoline]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[Not64BitMode, IsNotPIC]>; + Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi tglobaladdr:$dst, imm:$off)>, @@ -1125,13 +1125,21 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. 
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, NotUseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode, UseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode, UseRetpoline]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, Index: contrib/llvm/lib/Target/X86/X86InstrControl.td =================================================================== --- contrib/llvm/lib/Target/X86/X86InstrControl.td +++ contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -211,11 +211,12 @@ Sched<[WriteJumpLd]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, - OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + OpSize32, Requires<[Not64BitMode,NotUseRetpoline]>, + Sched<[WriteJump]>; def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, OpSize32, - Requires<[Not64BitMode,FavorMemIndirectCall]>, + Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>, Sched<[WriteJumpLd]>; let Predicates = [Not64BitMode] in { @@ -298,11 +299,12 @@ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)], IIC_CALL_RI>, - Requires<[In64BitMode]>; + Requires<[In64BitMode,NotUseRetpoline]>; def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, - Requires<[In64BitMode,FavorMemIndirectCall]>; + Requires<[In64BitMode,FavorMemIndirectCall, + NotUseRetpoline]>; def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; @@ -338,6 +340,27 @@ let mayLoad = 1 in def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + } +} + +let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, + Uses = [RSP], + usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def RETPOLINE_CALL32 : + PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>, + Requires<[Not64BitMode,UseRetpoline]>; + + def RETPOLINE_CALL64 : + PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>, + Requires<[In64BitMode,UseRetpoline]>; + + // Retpoline variant of indirect tail calls. + let isTerminator = 1, isReturn = 1, isBarrier = 1 in { + def RETPOLINE_TCRETURN64 : + PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>; + def RETPOLINE_TCRETURN32 : + PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>; } } Index: contrib/llvm/lib/Target/X86/X86InstrInfo.td =================================================================== --- contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -917,6 +917,8 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; +def UseRetpoline : Predicate<"Subtarget->useRetpoline()">; +def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. 
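The 'V' operand modifier added to the X86 asm printer earlier in this patch exists so that inline assembly can splice a bare register name into a thunk symbol. A minimal sketch of that usage, assuming externally provided `__x86_indirect_thunk_<reg>` thunks (the retpoline-external-thunk convention); the function name and exact symbol spelling are illustrative, not mandated by this patch:

    // C/C++ inline asm: %V0 prints the register holding `fn` without the
    // leading '%', so the assembler sees e.g. `call __x86_indirect_thunk_rax`
    // instead of an indirect `call *%rax`.
    static void call_via_thunk(void (*fn)(void)) {
      asm volatile("call __x86_indirect_thunk_%V0" : : "r"(fn) : "memory");
    }
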
Index: contrib/llvm/lib/Target/X86/X86MCInstLower.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -874,6 +874,10 @@ // address is to far away. (TODO: support non-relative addressing) break; case MachineOperand::MO_Register: + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error("Lowering register statepoints with retpoline not " + "yet implemented."); CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); CallOpcode = X86::CALL64r; break; @@ -1028,6 +1032,10 @@ EmitAndCountInstruction( MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); + // FIXME: Add retpoline support and remove this. + if (Subtarget->useRetpoline()) + report_fatal_error( + "Lowering patchpoint with retpoline not yet implemented."); EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } Index: contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp =================================================================== --- /dev/null +++ contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp @@ -0,0 +1,265 @@ +//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Pass that injects an MI thunk implementing a "retpoline". This is +/// a RET-implemented trampoline that is used to lower indirect calls in a way +/// that prevents speculation on some x86 processors and can be used to mitigate +/// security vulnerabilities due to targeted speculative execution and side +/// channels such as CVE-2017-5715. +/// +/// TODO(chandlerc): All of this code could use better comments and +/// documentation. 
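+///
+/// At a call site, the pseudo instructions defined in X86InstrControl.td are
+/// lowered so that (illustratively, with the callee starting out in %rax) an
+/// indirect call becomes
+///
+///   movq %rax, %r11
+///   callq __llvm_retpoline_r11
+///
+/// and the thunk emitted by this pass performs the actual transfer to the
+/// callee while capturing any speculation of its return in a tight
+/// pause/lfence loop.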
+/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-retpoline-thunks" + +static const char ThunkNamePrefix[] = "__llvm_retpoline_"; +static const char R11ThunkName[] = "__llvm_retpoline_r11"; +static const char EAXThunkName[] = "__llvm_retpoline_eax"; +static const char ECXThunkName[] = "__llvm_retpoline_ecx"; +static const char EDXThunkName[] = "__llvm_retpoline_edx"; +static const char EDIThunkName[] = "__llvm_retpoline_edi"; + +namespace { +class X86RetpolineThunks : public MachineFunctionPass { +public: + static char ID; + + X86RetpolineThunks() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "X86 Retpoline Thunks"; } + + bool doInitialization(Module &M) override; + bool runOnMachineFunction(MachineFunction &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + } + +private: + MachineModuleInfo *MMI; + const TargetMachine *TM; + bool Is64Bit; + const X86Subtarget *STI; + const X86InstrInfo *TII; + + bool InsertedThunks; + + void createThunkFunction(Module &M, StringRef Name); + void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); + void populateThunk(MachineFunction &MF, Optional Reg = None); +}; + +} // end anonymous namespace + +FunctionPass *llvm::createX86RetpolineThunksPass() { + return new X86RetpolineThunks(); +} + +char X86RetpolineThunks::ID = 0; + +bool X86RetpolineThunks::doInitialization(Module &M) { + InsertedThunks = false; + return false; +} + +bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << getPassName() << '\n'); + + TM = &MF.getTarget();; + STI = &MF.getSubtarget(); + TII = STI->getInstrInfo(); + Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; + + MMI = &getAnalysis(); + Module &M = const_cast(*MMI->getModule()); + + // If this function is not a thunk, check to see if we need to insert + // a thunk. + if (!MF.getName().startswith(ThunkNamePrefix)) { + // If we've already inserted a thunk, nothing else to do. + if (InsertedThunks) + return false; + + // Only add a thunk if one of the functions has the retpoline feature + // enabled in its subtarget, and doesn't enable external thunks. + // FIXME: Conditionalize on indirect calls so we don't emit a thunk when + // nothing will end up calling it. + // FIXME: It's a little silly to look at every function just to enumerate + // the subtargets, but eventually we'll want to look at them for indirect + // calls, so maybe this is OK. + if (!STI->useRetpoline() || STI->useRetpolineExternalThunk()) + return false; + + // Otherwise, we need to insert the thunk. + // WARNING: This is not really a well behaving thing to do in a function + // pass. We extract the module and insert a new function (and machine + // function) directly into the module. 
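+    // Note that createThunkFunction only creates a skeleton IR function at
+    // this point; the machine-level thunk body is filled in by populateThunk
+    // when this pass subsequently runs on the newly created functions.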
+ if (Is64Bit) + createThunkFunction(M, R11ThunkName); + else + for (StringRef Name : + {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName}) + createThunkFunction(M, Name); + InsertedThunks = true; + return true; + } + + // If this *is* a thunk function, we need to populate it with the correct MI. + if (Is64Bit) { + assert(MF.getName() == "__llvm_retpoline_r11" && + "Should only have an r11 thunk on 64-bit targets"); + + // __llvm_retpoline_r11: + // callq .Lr11_call_target + // .Lr11_capture_spec: + // pause + // lfence + // jmp .Lr11_capture_spec + // .align 16 + // .Lr11_call_target: + // movq %r11, (%rsp) + // retq + populateThunk(MF, X86::R11); + } else { + // For 32-bit targets we need to emit a collection of thunks for various + // possible scratch registers as well as a fallback that uses EDI, which is + // normally callee saved. + // __llvm_retpoline_eax: + // calll .Leax_call_target + // .Leax_capture_spec: + // pause + // jmp .Leax_capture_spec + // .align 16 + // .Leax_call_target: + // movl %eax, (%esp) # Clobber return addr + // retl + // + // __llvm_retpoline_ecx: + // ... # Same setup + // movl %ecx, (%esp) + // retl + // + // __llvm_retpoline_edx: + // ... # Same setup + // movl %edx, (%esp) + // retl + // + // __llvm_retpoline_edi: + // ... # Same setup + // movl %edi, (%esp) + // retl + if (MF.getName() == EAXThunkName) + populateThunk(MF, X86::EAX); + else if (MF.getName() == ECXThunkName) + populateThunk(MF, X86::ECX); + else if (MF.getName() == EDXThunkName) + populateThunk(MF, X86::EDX); + else if (MF.getName() == EDIThunkName) + populateThunk(MF, X86::EDI); + else + llvm_unreachable("Invalid thunk name on x86-32!"); + } + + return true; +} + +void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) { + assert(Name.startswith(ThunkNamePrefix) && + "Created a thunk with an unexpected prefix!"); + + LLVMContext &Ctx = M.getContext(); + auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); + Function *F = + Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); + F->setVisibility(GlobalValue::HiddenVisibility); + F->setComdat(M.getOrInsertComdat(Name)); + + // Add Attributes so that we don't create a frame, unwind information, or + // inline. + AttrBuilder B; + B.addAttribute(llvm::Attribute::NoUnwind); + B.addAttribute(llvm::Attribute::Naked); + F->addAttributes(llvm::AttributeList::FunctionIndex, B); + + // Populate our function a bit so that we can verify. + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); + IRBuilder<> Builder(Entry); + + Builder.CreateRetVoid(); +} + +void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, + unsigned Reg) { + const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; + const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP; + addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0) + .addReg(Reg); +} + +void X86RetpolineThunks::populateThunk(MachineFunction &MF, + Optional Reg) { + // Set MF properties. We never use vregs... + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); + + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + + MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); + MF.push_back(CaptureSpec); + MF.push_back(CallTarget); + + const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + const unsigned RetOpc = Is64Bit ? 
X86::RETQ : X86::RETL; + + BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget); + Entry->addSuccessor(CallTarget); + Entry->addSuccessor(CaptureSpec); + CallTarget->setHasAddressTaken(); + + // In the capture loop for speculation, we want to stop the processor from + // speculating as fast as possible. On Intel processors, the PAUSE instruction + // will block speculation without consuming any execution resources. On AMD + // processors, the PAUSE instruction is (essentially) a nop, so we also use an + // LFENCE instruction which they have advised will stop speculation as well + // with minimal resource utilization. We still end the capture with a jump to + // form an infinite loop to fully guarantee that no matter what implementation + // of the x86 ISA, speculating this code path never escapes. + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); + CaptureSpec->setHasAddressTaken(); + CaptureSpec->addSuccessor(CaptureSpec); + + CallTarget->setAlignment(4); + insertRegReturnAddrClobber(*CallTarget, *Reg); + BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); +} Index: contrib/llvm/lib/Target/X86/X86Subtarget.h =================================================================== --- contrib/llvm/lib/Target/X86/X86Subtarget.h +++ contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -297,6 +297,14 @@ /// Processor supports Cache Line Write Back instruction bool HasCLWB; + /// Use a retpoline thunk rather than indirect calls to block speculative + /// execution. + bool UseRetpoline; + + /// When using a retpoline thunk, call an externally provided thunk rather + /// than emitting one inside the compiler. + bool UseRetpolineExternalThunk; + /// Use software floating point for code generation. bool UseSoftFloat; @@ -506,6 +514,8 @@ bool hasPKU() const { return HasPKU; } bool hasMPX() const { return HasMPX; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } + bool useRetpoline() const { return UseRetpoline; } + bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } bool isXRaySupported() const override { return is64Bit(); } @@ -638,6 +648,10 @@ /// This function returns true if the target has sincos() routine in its /// compiler runtime or math libraries. bool hasSinCos() const; + + /// If we are using retpolines, we need to expand indirectbr to avoid it + /// lowering to an actual indirect jump. + bool enableIndirectBrExpand() const override { return useRetpoline(); } /// Enable the MachineScheduler pass for all X86 subtargets. 
bool enableMachineScheduler() const override { return true; } Index: contrib/llvm/lib/Target/X86/X86Subtarget.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -315,6 +315,8 @@ HasCLFLUSHOPT = false; HasCLWB = false; IsBTMemSlow = false; + UseRetpoline = false; + UseRetpolineExternalThunk = false; IsPMULLDSlow = false; IsSHLDSlow = false; IsUAMem16Slow = false; Index: contrib/llvm/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -305,6 +305,7 @@ void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; + void addPreEmitPass2() override; void addPreSched2() override; }; @@ -334,6 +335,11 @@ if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); + + // Add passes that handle indirect branch removal and insertion of a retpoline + // thunk. These will be a no-op unless a function subtarget has the retpoline + // feature enabled. + addPass(createIndirectBrExpandPass()); } bool X86PassConfig::addInstSelector() { @@ -417,4 +423,8 @@ addPass(createX86FixupLEAs()); addPass(createX86EvexToVexInsts()); } +} + +void X86PassConfig::addPreEmitPass2() { + addPass(createX86RetpolineThunksPass()); } Index: contrib/llvm/tools/clang/include/clang/Driver/Options.td =================================================================== --- contrib/llvm/tools/clang/include/clang/Driver/Options.td +++ contrib/llvm/tools/clang/include/clang/Driver/Options.td @@ -2422,6 +2422,11 @@ def mno_hexagon_hvx_double : Flag<["-"], "mno-hvx-double">, Group, Flags<[CC1Option]>, HelpText<"Disable Hexagon Double Vector eXtensions">; +def mretpoline : Flag<["-"], "mretpoline">, Group; +def mno_retpoline : Flag<["-"], "mno-retpoline">, Group; +def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group; +def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group; + // These are legacy user-facing driver-level option spellings. They are always // aliases for options that are spelled using the more common Unix / GNU flag // style of double-dash and equals-joined flags. Index: contrib/llvm/tools/clang/lib/Basic/Targets.cpp =================================================================== --- contrib/llvm/tools/clang/lib/Basic/Targets.cpp +++ contrib/llvm/tools/clang/lib/Basic/Targets.cpp @@ -2691,6 +2691,8 @@ bool HasCLWB = false; bool HasMOVBE = false; bool HasPREFETCHWT1 = false; + bool HasRetpoline = false; + bool HasRetpolineExternalThunk = false; /// \brief Enumeration of all of the X86 CPUs supported by Clang. 
/// @@ -3821,6 +3823,10 @@ HasPREFETCHWT1 = true; } else if (Feature == "+clzero") { HasCLZERO = true; + } else if (Feature == "+retpoline") { + HasRetpoline = true; + } else if (Feature == "+retpoline-external-thunk") { + HasRetpolineExternalThunk = true; } X86SSEEnum Level = llvm::StringSwitch(Feature) @@ -4285,6 +4291,8 @@ .Case("rdrnd", HasRDRND) .Case("rdseed", HasRDSEED) .Case("rtm", HasRTM) + .Case("retpoline", HasRetpoline) + .Case("retpoline-external-thunk", HasRetpolineExternalThunk) .Case("sgx", HasSGX) .Case("sha", HasSHA) .Case("sse", SSELevel >= SSE1) Index: contrib/llvm/tools/lld/ELF/Arch/X86.cpp =================================================================== --- contrib/llvm/tools/lld/ELF/Arch/X86.cpp +++ contrib/llvm/tools/lld/ELF/Arch/X86.cpp @@ -21,7 +21,7 @@ using namespace lld::elf; namespace { -class X86 final : public TargetInfo { +class X86 : public TargetInfo { public: X86(); RelExpr getRelExpr(uint32_t Type, const SymbolBody &S, @@ -358,7 +358,153 @@ memcpy(Loc - 2, Inst, sizeof(Inst)); } +namespace { +class RetpolinePic : public X86 { +public: + RetpolinePic(); + void writeGotPlt(uint8_t *Buf, const SymbolBody &S) const override; + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t Index, unsigned RelOff) const override; +}; + +class RetpolineNoPic : public X86 { +public: + RetpolineNoPic(); + void writeGotPlt(uint8_t *Buf, const SymbolBody &S) const override; + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t Index, unsigned RelOff) const override; +}; +} // namespace + +RetpolinePic::RetpolinePic() { + PltHeaderSize = 48; + PltEntrySize = 32; +} + +void RetpolinePic::writeGotPlt(uint8_t *Buf, const SymbolBody &S) const { + write32le(Buf, S.getPltVA() + 17); +} + +void RetpolinePic::writePltHeader(uint8_t *Buf) const { + const uint8_t Insn[] = { + 0xff, 0xb3, 0, 0, 0, 0, // 0: pushl GOTPLT+4(%ebx) + 0x50, // 6: pushl %eax + 0x8b, 0x83, 0, 0, 0, 0, // 7: mov GOTPLT+8(%ebx), %eax + 0xe8, 0x0e, 0x00, 0x00, 0x00, // d: call next + 0xf3, 0x90, // 12: loop: pause + 0x0f, 0xae, 0xe8, // 14: lfence + 0xeb, 0xf9, // 17: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19: int3; .align 16 + 0x89, 0x0c, 0x24, // 20: next: mov %ecx, (%esp) + 0x8b, 0x4c, 0x24, 0x04, // 23: mov 0x4(%esp), %ecx + 0x89, 0x44, 0x24, 0x04, // 27: mov %eax ,0x4(%esp) + 0x89, 0xc8, // 2b: mov %ecx, %eax + 0x59, // 2d: pop %ecx + 0xc3, // 2e: ret + 0xcc, // 2f: int3 + }; + memcpy(Buf, Insn, sizeof(Insn)); + assert(sizeof(Insn) == TargetInfo::PltHeaderSize); + + uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize(); + uint32_t GotPlt = InX::GotPlt->getVA() - Ebx; + write32le(Buf + 2, GotPlt + 4); + write32le(Buf + 9, GotPlt + 8); +} + +void RetpolinePic::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x50, // pushl %eax + 0x8b, 0x83, 0, 0, 0, 0, // mov foo@GOT(%ebx), %eax + 0xe8, 0, 0, 0, 0, // call plt+0x20 + 0xe9, 0, 0, 0, 0, // jmp plt+0x12 + 0x68, 0, 0, 0, 0, // pushl $reloc_offset + 0xe9, 0, 0, 0, 0, // jmp plt+0 + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, + }; + memcpy(Buf, Insn, sizeof(Insn)); + assert(sizeof(Insn) == TargetInfo::PltEntrySize); + + uint32_t Ebx = InX::Got->getVA() + InX::Got->getSize(); + write32le(Buf + 3, GotPltEntryAddr - Ebx); + write32le(Buf + 8, -Index * PltEntrySize - PltHeaderSize - 12 + 32); + 
write32le(Buf + 13, -Index * PltEntrySize - PltHeaderSize - 17 + 18); + write32le(Buf + 18, RelOff); + write32le(Buf + 23, -Index * PltEntrySize - PltHeaderSize - 27); +} + +RetpolineNoPic::RetpolineNoPic() { + PltHeaderSize = 48; + PltEntrySize = 32; +} + +void RetpolineNoPic::writeGotPlt(uint8_t *Buf, const SymbolBody &S) const { + write32le(Buf, S.getPltVA() + 16); +} + +void RetpolineNoPic::writePltHeader(uint8_t *Buf) const { + const uint8_t PltData[] = { + 0xff, 0x35, 0, 0, 0, 0, // 0: pushl GOTPLT+4 + 0x50, // 6: pushl %eax + 0xa1, 0, 0, 0, 0, // 7: mov GOTPLT+8, %eax + 0xe8, 0x0f, 0x00, 0x00, 0x00, // c: call next + 0xf3, 0x90, // 11: loop: pause + 0x0f, 0xae, 0xe8, // 13: lfence + 0xeb, 0xf9, // 16: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 18: int3 + 0xcc, 0xcc, 0xcc, // 1f: int3; .align 16 + 0x89, 0x0c, 0x24, // 20: next: mov %ecx, (%esp) + 0x8b, 0x4c, 0x24, 0x04, // 23: mov 0x4(%esp), %ecx + 0x89, 0x44, 0x24, 0x04, // 27: mov %eax ,0x4(%esp) + 0x89, 0xc8, // 2b: mov %ecx, %eax + 0x59, // 2d: pop %ecx + 0xc3, // 2e: ret + 0xcc, // 2f: int3 + }; + memcpy(Buf, PltData, sizeof(PltData)); + assert(sizeof(PltData) == TargetInfo::PltHeaderSize); + + uint32_t GotPlt = InX::GotPlt->getVA(); + write32le(Buf + 2, GotPlt + 4); + write32le(Buf + 8, GotPlt + 8); +} + +void RetpolineNoPic::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x50, // 0: pushl %eax + 0xa1, 0, 0, 0, 0, // 1: mov foo_in_GOT, %eax + 0xe8, 0, 0, 0, 0, // 6: call plt+0x20 + 0xe9, 0, 0, 0, 0, // b: jmp plt+0x11 + 0x68, 0, 0, 0, 0, // 10: pushl $reloc_offset + 0xe9, 0, 0, 0, 0, // 15: jmp plt+0 + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, + }; + memcpy(Buf, Insn, sizeof(Insn)); + assert(sizeof(Insn) == TargetInfo::PltEntrySize); + + write32le(Buf + 2, GotPltEntryAddr); + write32le(Buf + 7, -Index * PltEntrySize - PltHeaderSize - 11 + 32); + write32le(Buf + 12, -Index * PltEntrySize - PltHeaderSize - 16 + 17); + write32le(Buf + 17, RelOff); + write32le(Buf + 22, -Index * PltEntrySize - PltHeaderSize - 26); +} + TargetInfo *elf::getX86TargetInfo() { - static X86 Target; - return &Target; + if (Config->ZRetpolineplt) { + if (Config->Pic) { + static RetpolinePic T; + return &T; + } + static RetpolineNoPic T; + return &T; + } + + static X86 T; + return &T; } Index: contrib/llvm/tools/lld/ELF/Arch/X86_64.cpp =================================================================== --- contrib/llvm/tools/lld/ELF/Arch/X86_64.cpp +++ contrib/llvm/tools/lld/ELF/Arch/X86_64.cpp @@ -23,7 +23,7 @@ using namespace lld::elf; namespace { -template class X86_64 final : public TargetInfo { +template class X86_64 : public TargetInfo { public: X86_64(); RelExpr getRelExpr(uint32_t Type, const SymbolBody &S, @@ -462,12 +462,136 @@ write32le(Loc - 1, Val + 1); } -TargetInfo *elf::getX32TargetInfo() { - static X86_64 Target; - return &Target; +namespace { +template class Retpoline : public X86_64 { +public: + Retpoline(); + void writeGotPlt(uint8_t *Buf, const SymbolBody &S) const override; + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t Index, unsigned RelOff) const override; +}; + +template class RetpolineZNow : public X86_64 { +public: + RetpolineZNow(); + void writeGotPlt(uint8_t *Buf, const SymbolBody &S) const override {} + void writePltHeader(uint8_t *Buf) const override; + void writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, uint64_t PltEntryAddr, + int32_t 
Index, unsigned RelOff) const override; +}; +} // namespace + +template Retpoline::Retpoline() { + TargetInfo::PltHeaderSize = 48; + TargetInfo::PltEntrySize = 32; } -TargetInfo *elf::getX86_64TargetInfo() { - static X86_64 Target; - return &Target; +template +void Retpoline::writeGotPlt(uint8_t *Buf, const SymbolBody &S) const { + write32le(Buf, S.getPltVA() + 17); } + +template void Retpoline::writePltHeader(uint8_t *Buf) const { + const uint8_t Insn[] = { + 0xff, 0x35, 0, 0, 0, 0, // 0: pushq GOTPLT+8(%rip) + 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 6: mov GOTPLT+16(%rip), %r11 + 0xe8, 0x0e, 0x00, 0x00, 0x00, // d: callq next + 0xf3, 0x90, // 12: loop: pause + 0x0f, 0xae, 0xe8, // 14: lfence + 0xeb, 0xf9, // 17: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19: int3; .align 16 + 0x4c, 0x89, 0x1c, 0x24, // 20: next: mov %r11, (%rsp) + 0xc3, // 24: ret + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 25: int3; .align 16 + 0xcc, 0xcc, 0xcc, 0xcc, + }; + memcpy(Buf, Insn, sizeof(Insn)); + assert(sizeof(Insn) == TargetInfo::PltHeaderSize); + + uint64_t GotPlt = InX::GotPlt->getVA(); + uint64_t Plt = InX::Plt->getVA(); + write32le(Buf + 2, GotPlt - Plt - 6 + 8); + write32le(Buf + 9, GotPlt - Plt - 13 + 16); +} + +template +void Retpoline::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 0: mov foo@GOTPLT(%rip), %r11 + 0xe8, 0, 0, 0, 0, // 7: callq plt+0x20 + 0xe9, 0, 0, 0, 0, // c: jmp plt+0x12 + 0x68, 0, 0, 0, 0, // 11: pushq + 0xe9, 0, 0, 0, 0, // 16: jmp plt+0 + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // int3; .align 16 + }; + memcpy(Buf, Insn, sizeof(Insn)); + assert(sizeof(Insn) == TargetInfo::PltEntrySize); + + uint64_t Off = TargetInfo::PltHeaderSize + TargetInfo::PltEntrySize * Index; + + write32le(Buf + 3, GotPltEntryAddr - PltEntryAddr - 7); + write32le(Buf + 8, -Off - 12 + 32); + write32le(Buf + 13, -Off - 17 + 18); + write32le(Buf + 18, Index); + write32le(Buf + 23, -Off - 27); +} + +template RetpolineZNow::RetpolineZNow() { + TargetInfo::PltHeaderSize = 32; + TargetInfo::PltEntrySize = 16; +} + +template +void RetpolineZNow::writePltHeader(uint8_t *Buf) const { + const uint8_t Insn[] = { + 0xe8, 0x0b, 0x00, 0x00, 0x00, // 0: call next + 0xf3, 0x90, // 5: loop: pause + 0x0f, 0xae, 0xe8, // 7: lfence + 0xeb, 0xf9, // a: jmp loop + 0xcc, 0xcc, 0xcc, 0xcc, // c: int3; .align 16 + 0x4c, 0x89, 0x1c, 0x24, // 10: next: mov %r11, (%rsp) + 0xc3, // 14: ret + 0xcc, // 15: int3; .align 16 + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, + }; + memcpy(Buf, Insn, sizeof(Insn)); + assert(sizeof(Insn) == TargetInfo::PltHeaderSize); +} + +template +void RetpolineZNow::writePlt(uint8_t *Buf, uint64_t GotPltEntryAddr, + uint64_t PltEntryAddr, int32_t Index, + unsigned RelOff) const { + const uint8_t Insn[] = { + 0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // mov foo@GOTPLT(%rip), %r11 + 0xe9, 0, 0, 0, 0, // jmp plt+0 + 0xcc, 0xcc, 0xcc, 0xcc, // int3; .align 16 + }; + memcpy(Buf, Insn, sizeof(Insn)); + assert(sizeof(Insn) == TargetInfo::PltEntrySize); + + write32le(Buf + 3, GotPltEntryAddr - PltEntryAddr - 7); + write32le(Buf + 8, + -Index * TargetInfo::PltEntrySize - TargetInfo::PltHeaderSize - 12); +} + +template TargetInfo *getTargetInfo() { + if (Config->ZRetpolineplt) { + if (Config->ZNow) { + static RetpolineZNow T; + return &T; + } + static Retpoline T; + return &T; + } + + static X86_64 T; + return &T; +} + +TargetInfo *elf::getX32TargetInfo() { return getTargetInfo(); } 
+TargetInfo *elf::getX86_64TargetInfo() { return getTargetInfo(); } Index: contrib/llvm/tools/lld/ELF/Config.h =================================================================== --- contrib/llvm/tools/lld/ELF/Config.h +++ contrib/llvm/tools/lld/ELF/Config.h @@ -156,6 +156,7 @@ bool ZRelro; bool ZRodynamic; bool ZText; + bool ZRetpolineplt; bool ExitEarly; bool ZWxneeded; DiscardPolicy Discard; Index: contrib/llvm/tools/lld/ELF/Driver.cpp =================================================================== --- contrib/llvm/tools/lld/ELF/Driver.cpp +++ contrib/llvm/tools/lld/ELF/Driver.cpp @@ -690,6 +690,7 @@ Config->ZNow = hasZOption(Args, "now"); Config->ZOrigin = hasZOption(Args, "origin"); Config->ZRelro = !hasZOption(Args, "norelro"); + Config->ZRetpolineplt = hasZOption(Args, "retpolineplt"); Config->ZRodynamic = hasZOption(Args, "rodynamic"); Config->ZStackSize = getZOptionValue(Args, "stack-size", 0); Config->ZText = !hasZOption(Args, "notext"); Index: contrib/llvm/tools/opt/opt.cpp =================================================================== --- contrib/llvm/tools/opt/opt.cpp +++ contrib/llvm/tools/opt/opt.cpp @@ -401,6 +401,7 @@ initializeSjLjEHPreparePass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); initializeGlobalMergePass(Registry); + initializeIndirectBrExpandPassPass(Registry); initializeInterleavedAccessPass(Registry); initializeCountingFunctionInserterPass(Registry); initializeUnreachableBlockElimLegacyPassPass(Registry); Index: lib/clang/freebsd_cc_version.h =================================================================== --- lib/clang/freebsd_cc_version.h +++ lib/clang/freebsd_cc_version.h @@ -1,3 +1,3 @@ /* $FreeBSD$ */ -#define FREEBSD_CC_VERSION 1100505 +#define FREEBSD_CC_VERSION 1100506 Index: lib/clang/libllvm/Makefile =================================================================== --- lib/clang/libllvm/Makefile +++ lib/clang/libllvm/Makefile @@ -182,6 +182,7 @@ SRCS_MIN+= CodeGen/GlobalMerge.cpp SRCS_MIN+= CodeGen/IfConversion.cpp SRCS_MIN+= CodeGen/ImplicitNullChecks.cpp +SRCS_MIN+= CodeGen/IndirectBrExpandPass.cpp SRCS_MIN+= CodeGen/InlineSpiller.cpp SRCS_MIN+= CodeGen/InterferenceCache.cpp SRCS_MIN+= CodeGen/InterleavedAccessPass.cpp @@ -1037,6 +1038,7 @@ SRCS_MIN+= Target/X86/X86PadShortFunction.cpp SRCS_MIN+= Target/X86/X86RegisterBankInfo.cpp SRCS_MIN+= Target/X86/X86RegisterInfo.cpp +SRCS_MIN+= Target/X86/X86RetpolineThunks.cpp SRCS_MIN+= Target/X86/X86SelectionDAGInfo.cpp SRCS_MIN+= Target/X86/X86ShuffleDecodeConstantPool.cpp SRCS_MIN+= Target/X86/X86Subtarget.cpp Index: sys/sys/param.h =================================================================== --- sys/sys/param.h +++ sys/sys/param.h @@ -60,7 +60,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1101511 /* Master, propagated to newvers */ +#define __FreeBSD_version 1101512 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,