diff --git a/sys/cddl/dev/dtrace/dtrace_cddl.h b/sys/cddl/dev/dtrace/dtrace_cddl.h --- a/sys/cddl/dev/dtrace/dtrace_cddl.h +++ b/sys/cddl/dev/dtrace/dtrace_cddl.h @@ -88,6 +88,7 @@ void *td_systrace_args; /* syscall probe arguments. */ uint64_t td_fasttrap_tp_gen; /* Tracepoint hash table gen. */ struct trapframe *td_dtrace_trapframe; /* Trap frame from invop. */ + void *td_kinst; } kdtrace_thread_t; /* @@ -117,6 +118,7 @@ #define t_dtrace_systrace_args td_dtrace->td_systrace_args #define t_fasttrap_tp_gen td_dtrace->td_fasttrap_tp_gen #define t_dtrace_trapframe td_dtrace->td_dtrace_trapframe +#define t_kinst td_dtrace->td_kinst #define p_dtrace_helpers p_dtrace->p_dtrace_helpers #define p_dtrace_count p_dtrace->p_dtrace_count #define p_dtrace_probes p_dtrace->p_dtrace_probes diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.h b/sys/cddl/dev/kinst/amd64/kinst_isa.h new file mode 100644 --- /dev/null +++ b/sys/cddl/dev/kinst/amd64/kinst_isa.h @@ -0,0 +1,45 @@ +/* + * SPDX-License-Identifier: CDDL 1.0 + * + * Copyright 2022 Christos Margiolis + * Copyright 2022 Mark Johnston + */ + +#ifndef _KINST_ISA_H_ +#define _KINST_ISA_H_ + +#include + +#define KINST_PATCHVAL 0xcc + +/* + * Each trampoline is 32 bytes long and contains [instruction, jmp]. Since we + * have 2 instructions stored in the trampoline, and each of them can take up + * to 16 bytes, 32 bytes is enough to cover even the worst case scenario. + */ +#define KINST_TRAMP_SIZE 32 +#define KINST_TRAMPCHUNK_SIZE PAGE_SIZE + +/* + * Fill the trampolines with breakpoint instructions so that the kernel will + * crash cleanly if things somehow go wrong. 
+ */ +#define KINST_TRAMP_INIT(t, s) memset((t), KINST_PATCHVAL, (s)) + +typedef uint8_t kinst_patchval_t; + +struct kinst_probe_md { + int flags; + int instlen; /* original instr len */ + int tinstlen; /* trampoline instr len */ + uint8_t template[16]; /* copied into thread tramps */ + int dispoff; /* offset of rip displacement */ + + /* operands to "call" instruction branch target */ + int reg1; + int reg2; + int scale; + int64_t disp; +}; + +#endif /* _KINST_ISA_H_ */ diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.c b/sys/cddl/dev/kinst/amd64/kinst_isa.c new file mode 100644 --- /dev/null +++ b/sys/cddl/dev/kinst/amd64/kinst_isa.c @@ -0,0 +1,550 @@ +/* + * SPDX-License-Identifier: CDDL 1.0 + * + * Copyright 2022 Christos Margiolis + * Copyright 2022 Mark Johnston + */ + +#include + +#include +#include + +#include +#include +#include + +#include "kinst.h" + +#define KINST_PUSHL_RBP 0x55 +#define KINST_STI 0xfb +#define KINST_POPF 0x9d + +#define KINST_MODRM_MOD(b) (((b) & 0xc0) >> 6) +#define KINST_MODRM_REG(b) (((b) & 0x38) >> 3) +#define KINST_MODRM_RM(b) ((b) & 0x07) + +#define KINST_SIB_SCALE(s) (((s) & 0xc0) >> 6) +#define KINST_SIB_INDEX(s) (((s) & 0x38) >> 3) +#define KINST_SIB_BASE(s) (((s) & 0x07) >> 0) + +#define KINST_REX_W(r) (((r) & 0x08) >> 3) +#define KINST_REX_R(r) (((r) & 0x04) >> 2) +#define KINST_REX_X(r) (((r) & 0x02) >> 1) +#define KINST_REX_B(r) (((r) & 0x01) >> 0) + +#define KINST_F_CALL 0x0001 /* instruction is a "call" */ +#define KINST_F_DIRECT_CALL 0x0002 /* instruction is a direct call */ +#define KINST_F_RIPREL 0x0004 /* instruction is position-dependent */ +#define KINST_F_JMP 0x0008 /* instruction is a %rip-relative jmp */ +#define KINST_F_MOD_DIRECT 0x0010 /* operand is not a memory address */ + +/* + * Map ModR/M register bits to a trapframe offset. 
+ */ +static int +kinst_regoff(int reg) +{ +#define _MATCH_REG(i, reg) \ + case i: \ + return (offsetof(struct trapframe, tf_ ## reg) / \ + sizeof(register_t)) + switch (reg) { + _MATCH_REG( 0, rax); + _MATCH_REG( 1, rcx); + _MATCH_REG( 2, rdx); + _MATCH_REG( 3, rbx); + _MATCH_REG( 4, rsp); /* SIB when mod != 3 */ + _MATCH_REG( 5, rbp); + _MATCH_REG( 6, rsi); + _MATCH_REG( 7, rdi); + _MATCH_REG( 8, r8); /* REX.R is set */ + _MATCH_REG( 9, r9); + _MATCH_REG(10, r10); + _MATCH_REG(11, r11); + _MATCH_REG(12, r12); + _MATCH_REG(13, r13); + _MATCH_REG(14, r14); + _MATCH_REG(15, r15); + } +#undef _MATCH_REG + panic("%s: unhandled register index %d", __func__, reg); +} + +/* + * Obtain the specified register's value. + */ +static uint64_t +kinst_regval(struct trapframe *frame, int reg) +{ + if (reg == -1) + return (0); + return (((register_t *)frame)[kinst_regoff(reg)]); +} + +static uint32_t +kinst_riprel_disp(struct kinst_probe *kp, void *dst) +{ + return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp - + (intptr_t)dst)); +} + +static void +kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp) +{ + uint8_t *instr; + uint32_t disp; + int ilen; + + ilen = kp->kp_md.tinstlen; + + memcpy(tramp, kp->kp_md.template, ilen); + if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) { + disp = kinst_riprel_disp(kp, tramp); + memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t)); + } + + /* + * The following position-independent jmp takes us back to the + * original code. It is encoded as "jmp *0(%rip)" (six bytes), + * followed by the absolute address of the instruction following + * the one that was traced (eight bytes). 
+ */ + tramp[ilen + 0] = 0xff; + tramp[ilen + 1] = 0x25; + tramp[ilen + 2] = 0x00; + tramp[ilen + 3] = 0x00; + tramp[ilen + 4] = 0x00; + tramp[ilen + 5] = 0x00; + instr = kp->kp_patchpoint + kp->kp_md.instlen; + memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t)); +} + +int +kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch) +{ + solaris_cpu_t *cpu; + uintptr_t *stack, retaddr; + struct kinst_probe *kp; + struct kinst_probe_md *kpmd; + uint8_t *tramp; + + stack = (uintptr_t *)frame->tf_rsp; + cpu = &solaris_cpu[curcpu]; + + LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) { + if ((uintptr_t)kp->kp_patchpoint == addr) + break; + } + if (kp == NULL) + return (0); + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + cpu->cpu_dtrace_caller = stack[0]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR); + dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0); + cpu->cpu_dtrace_caller = 0; + + kpmd = &kp->kp_md; + if ((kpmd->flags & KINST_F_CALL) != 0) { + /* + * dtrace_invop_start() reserves space on the stack to + * store the return address of the call instruction. + */ + retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen); + *(uintptr_t *)scratch = retaddr; + + if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) { + frame->tf_rip = (uintptr_t)(kp->kp_patchpoint + + kpmd->disp + kpmd->instlen); + } else { + register_t rval; + + if (kpmd->reg1 == -1 && kpmd->reg2 == -1) { + /* rip-relative */ + rval = frame->tf_rip - 1 + kpmd->instlen; + } else { + /* indirect */ + rval = kinst_regval(frame, kpmd->reg1) + + (kinst_regval(frame, kpmd->reg2) << + kpmd->scale); + } + + if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) { + frame->tf_rip = rval + kpmd->disp; + } else { + frame->tf_rip = + *(uintptr_t *)(rval + kpmd->disp); + } + } + return (DTRACE_INVOP_CALL); + } else { + tramp = curthread->t_kinst; + if (tramp == NULL) { + /* + * A trampoline allocation failed, so this probe is + * effectively disabled. Restore the original + * instruction. 
+ * + * We can't safely print anything here, but the + * trampoline allocator should have left a breadcrumb in + * the dmesg. + */ + kinst_patch_tracepoint(kp, kp->kp_savedval); + frame->tf_rip = (register_t)kp->kp_patchpoint; + } else { + kinst_trampoline_populate(kp, tramp); + frame->tf_rip = (register_t)tramp; + } + return (DTRACE_INVOP_NOP); + } +} + +void +kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val) +{ + register_t reg; + int oldwp; + + reg = intr_disable(); + oldwp = disable_wp(); + *kp->kp_patchpoint = val; + restore_wp(oldwp); + intr_restore(reg); +} + +static void +kinst_set_disp8(struct kinst_probe *kp, uint8_t byte) +{ + kp->kp_md.disp = (int64_t)(int8_t)byte; +} + +static void +kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes) +{ + int32_t disp32; + + memcpy(&disp32, bytes, sizeof(disp32)); + kp->kp_md.disp = (int64_t)disp32; +} + +static int +kinst_dis_get_byte(void *p) +{ + int ret; + uint8_t **instr = p; + + ret = **instr; + (*instr)++; + + return (ret); +} + +/* + * Set up all of the state needed to faithfully execute a probed instruction. + * + * In the simple case, we copy the instruction unmodified to a per-thread + * trampoline, wherein it is followed by a jump back to the original code. + * - Instructions can have %rip as an operand: + * - with %rip-relative addressing encoded in ModR/M, or + * - implicitly as a part of the instruction definition (jmp, call). + * - Call instructions (which may be %rip-relative) need to push the correct + * return address onto the stack. + * + * Call instructions are simple enough to be emulated in software, so we simply + * do not use the trampoline mechanism in that case. kinst_invop() will compute + * the branch target using the address info computed here (register operands and + * displacement). 
+ * + * %rip-relative operands encoded using the ModR/M byte always use a 32-bit + * displacement; when populating the trampoline the displacement is adjusted to + * be relative to the trampoline address. Trampolines are always allocated + * above KERNBASE for this reason. + * + * For other %rip-relative operands (just jumps) we take the same approach. + * Instructions which specify an 8-bit displacement must be rewritten to use a + * 32-bit displacement. + */ +static int +kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr) +{ + struct kinst_probe_md *kpmd; + dis86_t d86; + uint8_t *bytes, modrm, rex; + int dispoff, i, ilen, opcidx; + + kpmd = &kp->kp_md; + + d86.d86_data = instr; + d86.d86_get_byte = kinst_dis_get_byte; + d86.d86_check_func = NULL; + if (dtrace_disx86(&d86, SIZE64) != 0) { + KINST_LOG("failed to disassemble instruction at: %p", *instr); + return (EINVAL); + } + bytes = d86.d86_bytes; + kpmd->instlen = kpmd->tinstlen = d86.d86_len; + + /* + * Skip over prefixes, save REX. + */ + rex = 0; + for (i = 0; i < kpmd->instlen; i++) { + switch (bytes[i]) { + case 0xf0 ... 0xf3: + /* group 1 */ + continue; + case 0x26: + case 0x2e: + case 0x36: + case 0x3e: + case 0x64: + case 0x65: + /* group 2 */ + continue; + case 0x66: + /* group 3 */ + continue; + case 0x67: + /* group 4 */ + continue; + case 0x40 ... 0x4f: + /* REX */ + rex = bytes[i]; + continue; + } + break; + } + KASSERT(i < kpmd->instlen, + ("%s: failed to disassemble instruction at %p", __func__, bytes)); + opcidx = i; + + /* + * Identify instructions of interest by opcode: calls and jumps. + * Extract displacements. + */ + dispoff = -1; + switch (bytes[opcidx]) { + case 0x0f: + switch (bytes[opcidx + 1]) { + case 0x80 ... 
0x8f: + /* conditional jmp near */ + kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; + dispoff = opcidx + 2; + kinst_set_disp32(kp, &bytes[dispoff]); + break; + } + break; + case 0xe3: + /* + * There is no straightforward way to translate this instruction + * to use a 32-bit displacement. Fortunately, it is rarely + * used. + */ + return (EINVAL); + case 0x70 ... 0x7f: + /* conditional jmp short */ + kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; + dispoff = opcidx + 1; + kinst_set_disp8(kp, bytes[dispoff]); + break; + case 0xe9: + /* unconditional jmp near */ + kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; + dispoff = opcidx + 1; + kinst_set_disp32(kp, &bytes[dispoff]); + break; + case 0xeb: + /* unconditional jmp short */ + kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL; + dispoff = opcidx + 1; + kinst_set_disp8(kp, bytes[dispoff]); + break; + case 0xe8: + case 0x9a: + /* direct call */ + kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL; + dispoff = opcidx + 1; + kinst_set_disp32(kp, &bytes[dispoff]); + break; + case 0xff: + KASSERT(d86.d86_got_modrm, + ("no ModR/M byte for instr at %p", *instr - kpmd->instlen)); + switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) { + case 0x02: + case 0x03: + /* indirect call */ + kpmd->flags |= KINST_F_CALL; + break; + case 0x04: + case 0x05: + /* indirect jump */ + kpmd->flags |= KINST_F_JMP; + break; + } + } + + /* + * If there's a ModR/M byte, we need to check it to see if the operand + * is %rip-relative, and rewrite the displacement if so. If not, we + * might still have to extract operand info if this is a call + * instruction. 
+ */ + if (d86.d86_got_modrm) { + uint8_t mod, rm, sib; + + kpmd->reg1 = kpmd->reg2 = -1; + + modrm = bytes[d86.d86_rmindex]; + mod = KINST_MODRM_MOD(modrm); + rm = KINST_MODRM_RM(modrm); + if (mod == 0 && rm == 5) { + kpmd->flags |= KINST_F_RIPREL; + dispoff = d86.d86_rmindex + 1; + kinst_set_disp32(kp, &bytes[dispoff]); + } else if ((kpmd->flags & KINST_F_CALL) != 0) { + bool havesib; + + havesib = (mod != 3 && rm == 4); + dispoff = d86.d86_rmindex + (havesib ? 2 : 1); + if (mod == 1) + kinst_set_disp8(kp, bytes[dispoff]); + else if (mod == 2) + kinst_set_disp32(kp, &bytes[dispoff]); + else if (mod == 3) + kpmd->flags |= KINST_F_MOD_DIRECT; + + if (havesib) { + sib = bytes[d86.d86_rmindex + 1]; + if (KINST_SIB_BASE(sib) != 5) { + kpmd->reg1 = KINST_SIB_BASE(sib) | + (KINST_REX_B(rex) << 3); + } + kpmd->scale = KINST_SIB_SCALE(sib); + kpmd->reg2 = KINST_SIB_INDEX(sib) | + (KINST_REX_X(rex) << 3); + } else { + kpmd->reg1 = rm | (KINST_REX_B(rex) << 3); + } + } + } + + /* + * Calls are emulated in software; once operands are decoded we have + * nothing else to do. + */ + if ((kpmd->flags & KINST_F_CALL) != 0) + return (0); + + /* + * Allocate and populate an instruction trampoline template. + * + * Position-independent instructions can simply be copied, but + * position-dependent instructions require some surgery: jump + * instructions with an 8-bit displacement need to be converted to use a + * 32-bit displacement, and the adjusted displacement needs to be + * computed. 
+ */ + ilen = kpmd->instlen; + if ((kpmd->flags & KINST_F_RIPREL) != 0) { + if ((kpmd->flags & KINST_F_JMP) == 0 || + bytes[opcidx] == 0x0f || + bytes[opcidx] == 0xe9 || + bytes[opcidx] == 0xff) { + memcpy(kpmd->template, bytes, dispoff); + memcpy(&kpmd->template[dispoff + 4], + &bytes[dispoff + 4], ilen - (dispoff + 4)); + kpmd->dispoff = dispoff; + } else if (bytes[opcidx] == 0xeb) { + memcpy(kpmd->template, bytes, opcidx); + kpmd->template[opcidx] = 0xe9; + kpmd->dispoff = opcidx + 1; + + /* Instruction length changes from 2 to 5. */ + kpmd->tinstlen = 5; + kpmd->disp -= 3; + } else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) { + memcpy(kpmd->template, bytes, opcidx); + kpmd->template[opcidx] = 0x0f; + kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10; + kpmd->dispoff = opcidx + 2; + + /* Instruction length changes from 2 to 6. */ + kpmd->tinstlen = 6; + kpmd->disp -= 4; + } else { + panic("unhandled opcode %#x", bytes[opcidx]); + } + } else { + memcpy(kpmd->template, bytes, ilen); + } + + return (0); +} + +int +kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval, + void *opaque) +{ + struct kinst_probe *kp; + dtrace_kinst_probedesc_t *pd; + const char *func; + int error, n, off; + uint8_t *instr, *limit; + + pd = opaque; + func = symval->name; + if (strcmp(func, pd->kpd_func) != 0 || strcmp(func, "trap_check") == 0) + return (0); + + instr = (uint8_t *)symval->value; + limit = (uint8_t *)symval->value + symval->size; + if (instr >= limit) + return (0); + + /* + * Ignore functions not beginning with the usual function prologue. + * These might correspond to assembly routines with which we should not + * meddle. 
+ */ + if (*instr != KINST_PUSHL_RBP) + return (0); + + n = 0; + while (instr < limit) { + off = (int)(instr - (uint8_t *)symval->value); + if (pd->kpd_off != -1 && off != pd->kpd_off) { + int instrsize; + + /* + * Not the requested offset: step over this instruction. + * Stop scanning if it cannot be sized, since advancing + * by a non-positive length would never reach "limit". + */ + instrsize = dtrace_instr_size(instr); + if (instrsize <= 0) + break; + instr += instrsize; + continue; + } + + /* + * Prevent separate dtrace(1) instances from creating copies of + * the same probe. + */ + LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) { + if (strcmp(kp->kp_func, func) == 0 && + strtol(kp->kp_name, NULL, 10) == off) + return (0); + } + if (++n > KINST_PROBETAB_MAX) { + KINST_LOG("probe list full: %d entries", n); + return (ENOMEM); + } + kp = malloc(sizeof(struct kinst_probe), M_KINST, + M_WAITOK | M_ZERO); + kp->kp_func = func; + snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off); + kp->kp_savedval = *instr; + kp->kp_patchval = KINST_PATCHVAL; + kp->kp_patchpoint = instr; + + /* + * kinst_instr_dissect() is handed &instr and its disassembler + * callback advances the pointer as it consumes bytes. + */ + error = kinst_instr_dissect(kp, &instr); + if (error != 0) { + /* + * The probe has not been passed to + * kinst_probe_create() yet, so nothing else + * references it; release it instead of leaking it. + */ + free(kp, M_KINST); + return (error); + } + + kinst_probe_create(kp, lf); + } + + return (0); +} diff --git a/sys/cddl/dev/kinst/kinst.h b/sys/cddl/dev/kinst/kinst.h new file mode 100644 --- /dev/null +++ b/sys/cddl/dev/kinst/kinst.h @@ -0,0 +1,71 @@ +/* + * SPDX-License-Identifier: CDDL 1.0 + * + * Copyright 2022 Christos Margiolis + */ + +#ifndef _KINST_H_ +#define _KINST_H_ + +#include + +typedef struct { + char kpd_func[DTRACE_FUNCNAMELEN]; + char kpd_mod[DTRACE_MODNAMELEN]; + int kpd_off; +} dtrace_kinst_probedesc_t; + +#define KINSTIOC_MAKEPROBE _IOW('k', 1, dtrace_kinst_probedesc_t) + +#ifdef _KERNEL + +#include + +#include "kinst_isa.h" + +struct kinst_probe { + LIST_ENTRY(kinst_probe) kp_hashnext; + const char *kp_func; + char kp_name[16]; + dtrace_id_t kp_id; + kinst_patchval_t kp_patchval; + kinst_patchval_t kp_savedval; + kinst_patchval_t *kp_patchpoint; + + struct kinst_probe_md kp_md; +}; + +LIST_HEAD(kinst_probe_list, kinst_probe); + +extern struct kinst_probe_list *kinst_probetab; + +#define KINST_PROBETAB_MAX 0x8000 /* 32k */ +#define KINST_ADDR2NDX(addr) (((uintptr_t)(addr)) & 
(KINST_PROBETAB_MAX - 1)) +#define KINST_GETPROBE(i) (&kinst_probetab[KINST_ADDR2NDX(i)]) + +struct linker_file; +struct linker_symval; + +int kinst_invop(uintptr_t, struct trapframe *, uintptr_t); +int kinst_make_probe(struct linker_file *, int, struct linker_symval *, + void *); +void kinst_patch_tracepoint(struct kinst_probe *, kinst_patchval_t); +void kinst_probe_create(struct kinst_probe *, struct linker_file *); + +int kinst_trampoline_init(void); +int kinst_trampoline_deinit(void); +uint8_t *kinst_trampoline_alloc(int); +void kinst_trampoline_dealloc(uint8_t *); + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_KINST); +#endif /* MALLOC_DECLARE */ + +#define KINST_LOG_HELPER(fmt, ...) \ + printf("%s:%d: " fmt "%s\n", __func__, __LINE__, __VA_ARGS__) +#define KINST_LOG(...) \ + KINST_LOG_HELPER(__VA_ARGS__, "") + +#endif /* _KERNEL */ + +#endif /* _KINST_H_ */ diff --git a/sys/cddl/dev/kinst/kinst.c b/sys/cddl/dev/kinst/kinst.c new file mode 100644 --- /dev/null +++ b/sys/cddl/dev/kinst/kinst.c @@ -0,0 +1,233 @@ +/* + * SPDX-License-Identifier: CDDL 1.0 + * + * Copyright 2022 Christos Margiolis + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "kinst.h" + +MALLOC_DEFINE(M_KINST, "kinst", "Kernel Instruction Tracing"); + +static d_open_t kinst_open; +static d_close_t kinst_close; +static d_ioctl_t kinst_ioctl; + +static void kinst_provide_module(void *, modctl_t *); +static void kinst_getargdesc(void *, dtrace_id_t, void *, + dtrace_argdesc_t *); +static void kinst_destroy(void *, dtrace_id_t, void *); +static void kinst_enable(void *, dtrace_id_t, void *); +static void kinst_disable(void *, dtrace_id_t, void *); +static int kinst_load(void *); +static int kinst_unload(void *); +static int kinst_modevent(module_t, int, void *); + +static dtrace_pattr_t kinst_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ 
DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +}; + +static dtrace_pops_t kinst_pops = { + .dtps_provide = NULL, + .dtps_provide_module = kinst_provide_module, + .dtps_enable = kinst_enable, + .dtps_disable = kinst_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = kinst_getargdesc, + .dtps_getargval = NULL, + .dtps_usermode = NULL, + .dtps_destroy = kinst_destroy +}; + +static struct cdevsw kinst_cdevsw = { + .d_name = "kinst", + .d_version = D_VERSION, + .d_flags = D_TRACKCLOSE, + .d_open = kinst_open, + .d_close = kinst_close, + .d_ioctl = kinst_ioctl, +}; + +static dtrace_provider_id_t kinst_id; +struct kinst_probe_list *kinst_probetab; +static struct cdev *kinst_cdev; + +/* + * Create a DTrace probe for "kp" under the kinst provider, naming it after + * the linker file, function and probe name, and insert "kp" into the + * kinst_probetab bucket selected by its patch point address. + */ +void +kinst_probe_create(struct kinst_probe *kp, linker_file_t lf) +{ + kp->kp_id = dtrace_probe_create(kinst_id, lf->filename, + kp->kp_func, kp->kp_name, 3, kp); + + LIST_INSERT_HEAD(KINST_GETPROBE(kp->kp_patchpoint), kp, kp_hashnext); +} + +static int +kinst_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, + struct thread *td __unused) +{ + return (0); +} + +static int +kinst_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused, + struct thread *td __unused) +{ + dtrace_condense(kinst_id); + return (0); +} + +/* + * Callback run for each linker file on behalf of KINSTIOC_MAKEPROBE. "arg" is + * the caller's dtrace_kinst_probedesc_t. Files whose name does not match a + * non-empty kpd_mod are skipped by returning 0. + */ +static int +kinst_linker_file_cb(linker_file_t lf, void *arg) +{ + dtrace_kinst_probedesc_t *pd; + + pd = arg; + if (pd->kpd_mod[0] != '\0' && strcmp(pd->kpd_mod, lf->filename) != 0) + return (0); + + /* + * Invoke kinst_make_probe() once for each function symbol in + * the module "lf". 
+ */ + return (linker_file_function_listall(lf, kinst_make_probe, arg)); +} + +static int +kinst_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr, + int flags __unused, struct thread *td __unused) +{ + dtrace_kinst_probedesc_t *pd; + int error = 0; + + switch (cmd) { + case KINSTIOC_MAKEPROBE: + pd = (dtrace_kinst_probedesc_t *)addr; + pd->kpd_func[sizeof(pd->kpd_func) - 1] = '\0'; + pd->kpd_mod[sizeof(pd->kpd_mod) - 1] = '\0'; + + /* Loop over all functions in the kernel and loaded modules. */ + error = linker_file_foreach(kinst_linker_file_cb, pd); + break; + default: + error = ENOTTY; + break; + } + + return (error); +} + +static void +kinst_provide_module(void *arg, modctl_t *lf) +{ +} + +static void +kinst_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc) +{ + desc->dtargd_ndx = DTRACE_ARGNONE; +} + +static void +kinst_destroy(void *arg, dtrace_id_t id, void *parg) +{ + struct kinst_probe *kp = parg; + + LIST_REMOVE(kp, kp_hashnext); + free(kp, M_KINST); +} + +static void +kinst_enable(void *arg, dtrace_id_t id, void *parg) +{ + struct kinst_probe *kp = parg; + + kinst_patch_tracepoint(kp, kp->kp_patchval); +} + +static void +kinst_disable(void *arg, dtrace_id_t id, void *parg) +{ + struct kinst_probe *kp = parg; + + kinst_patch_tracepoint(kp, kp->kp_savedval); +} + +static int +kinst_load(void *dummy) +{ + int error; + + error = kinst_trampoline_init(); + if (error != 0) + return (error); + + error = dtrace_register("kinst", &kinst_attr, DTRACE_PRIV_USER, NULL, + &kinst_pops, NULL, &kinst_id); + if (error != 0) { + kinst_trampoline_deinit(); + return (error); + } + kinst_probetab = malloc(KINST_PROBETAB_MAX * + sizeof(struct kinst_probe_list), M_KINST, M_WAITOK | M_ZERO); + for (int i = 0; i < KINST_PROBETAB_MAX; i++) + LIST_INIT(&kinst_probetab[i]); + kinst_cdev = make_dev(&kinst_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "dtrace/kinst"); + dtrace_invop_add(kinst_invop); + return (0); +} + +static int +kinst_unload(void *dummy) 
+{ + int error; + + /* + * Remove the device node first so that no new probes can be created + * through the ioctl while the provider is being torn down. + */ + destroy_dev(kinst_cdev); + + /* + * Unregister the provider before releasing any probe state. + * Unregistering may destroy remaining probes through the dtps_destroy + * hook (kinst_destroy()), which unlinks them from their kinst_probetab + * bucket, so the table must still be allocated at that point. If + * unregistration fails, leave the breakpoint handler, the trampolines + * and the hash table in place so that existing probes keep working. + */ + error = dtrace_unregister(kinst_id); + if (error != 0) + return (error); + + /* + * Stop kinst_invop() from being called before freeing the hash table + * and the trampolines that it reads. + */ + dtrace_invop_remove(kinst_invop); + kinst_trampoline_deinit(); + free(kinst_probetab, M_KINST); + kinst_probetab = NULL; + + return (0); +} + +static int +kinst_modevent(module_t mod __unused, int type, void *data __unused) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + KINST_LOG( + "kinst: This provider is experimental, exercise caution"); + break; + case MOD_UNLOAD: + break; + case MOD_SHUTDOWN: + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +SYSINIT(kinst_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_load, NULL); +SYSUNINIT(kinst_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_unload, + NULL); + +DEV_MODULE(kinst, kinst_modevent, NULL); +MODULE_VERSION(kinst, 1); +MODULE_DEPEND(kinst, dtrace, 1, 1, 1); +MODULE_DEPEND(kinst, opensolaris, 1, 1, 1); diff --git a/sys/cddl/dev/kinst/trampoline.c b/sys/cddl/dev/kinst/trampoline.c new file mode 100644 --- /dev/null +++ b/sys/cddl/dev/kinst/trampoline.c @@ -0,0 +1,303 @@ +/* + * SPDX-License-Identifier: CDDL 1.0 + * + * Copyright 2022 Christos Margiolis + * Copyright 2022 Mark Johnston + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "kinst.h" +#include "kinst_isa.h" + +/* + * We can have 4KB/32B = 128 trampolines per chunk. + */ +#define KINST_TRAMPS_PER_CHUNK (KINST_TRAMPCHUNK_SIZE / KINST_TRAMP_SIZE) +/* + * Set the object size to 2GB, since we know that the object will only ever be + * used to allocate pages in the range [KERNBASE, 0xfffffffffffff000]. 
+ */ +#define KINST_VMOBJ_SIZE (VM_MAX_ADDRESS - KERNBASE) + +struct trampchunk { + TAILQ_ENTRY(trampchunk) next; + uint8_t *addr; + /* 0 -> allocated, 1 -> free */ + BITSET_DEFINE(, KINST_TRAMPS_PER_CHUNK) free; +}; + +static TAILQ_HEAD(, trampchunk) kinst_trampchunks = + TAILQ_HEAD_INITIALIZER(kinst_trampchunks); +static struct sx kinst_tramp_sx; +SX_SYSINIT(kinst_tramp_sx, &kinst_tramp_sx, "kinst tramp"); +static eventhandler_tag kinst_thread_ctor_handler; +static eventhandler_tag kinst_thread_dtor_handler; + +static struct trampchunk * +kinst_trampchunk_alloc(void) +{ + struct trampchunk *chunk; + vm_offset_t trampaddr; + int error __diagused; + + sx_assert(&kinst_tramp_sx, SX_XLOCKED); + + /* + * Allocate virtual memory for the trampoline chunk. The returned + * address is saved in "trampaddr". + * + * Setting "trampaddr" to KERNBASE causes vm_map_find() to return an + * address above KERNBASE, so this satisfies both requirements. + */ + trampaddr = KERNBASE; + error = vm_map_find(kernel_map, NULL, 0, &trampaddr, + KINST_TRAMPCHUNK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, + 0); + if (error != KERN_SUCCESS) { + KINST_LOG("trampoline chunk allocation failed: %d", error); + return (NULL); + } + + error = kmem_back(kernel_object, trampaddr, KINST_TRAMPCHUNK_SIZE, + M_WAITOK | M_EXEC); + KASSERT(error == KERN_SUCCESS, ("kmem_back failed: %d", error)); + + KINST_TRAMP_INIT((void *)trampaddr, KINST_TRAMPCHUNK_SIZE); + + /* Allocate a tracker for this chunk. 
*/ + chunk = malloc(sizeof(*chunk), M_KINST, M_WAITOK); + chunk->addr = (void *)trampaddr; + BIT_FILL(KINST_TRAMPS_PER_CHUNK, &chunk->free); + + TAILQ_INSERT_HEAD(&kinst_trampchunks, chunk, next); + + return (chunk); +} + +static void +kinst_trampchunk_free(struct trampchunk *chunk) +{ + sx_assert(&kinst_tramp_sx, SX_XLOCKED); + + TAILQ_REMOVE(&kinst_trampchunks, chunk, next); + kmem_unback(kernel_object, (vm_offset_t)chunk->addr, + KINST_TRAMPCHUNK_SIZE); + (void)vm_map_remove(kernel_map, (vm_offset_t)chunk->addr, + (vm_offset_t)(chunk->addr + KINST_TRAMPCHUNK_SIZE)); + free(chunk, M_KINST); +} + +static uint8_t * +kinst_trampoline_alloc_locked(int how) +{ + struct trampchunk *chunk; + uint8_t *tramp; + int off; + + sx_assert(&kinst_tramp_sx, SX_XLOCKED); + + TAILQ_FOREACH(chunk, &kinst_trampchunks, next) { + /* All trampolines from this chunk are already allocated. */ + if ((off = BIT_FFS(KINST_TRAMPS_PER_CHUNK, &chunk->free)) == 0) + continue; + /* BIT_FFS() returns indices starting at 1 instead of 0. */ + off--; + break; + } + if (chunk == NULL) { + if ((how & M_NOWAIT) != 0) + return (NULL); + + /* + * We didn't find any free trampoline in the current list, + * allocate a new one. If that happens the provider will no + * longer be reliable, so try to warn the user. 
+ */ + if ((chunk = kinst_trampchunk_alloc()) == NULL) { + static bool once = true; + + if (once) { + once = false; + KINST_LOG( + "kinst: failed to allocate trampoline, " + "probes may not fire"); + } + return (NULL); + } + off = 0; + } + BIT_CLR(KINST_TRAMPS_PER_CHUNK, off, &chunk->free); + tramp = chunk->addr + off * KINST_TRAMP_SIZE; + return (tramp); +} + +uint8_t * +kinst_trampoline_alloc(int how) +{ + uint8_t *tramp; + + sx_xlock(&kinst_tramp_sx); + tramp = kinst_trampoline_alloc_locked(how); + sx_xunlock(&kinst_tramp_sx); + return (tramp); +} + +static void +kinst_trampoline_dealloc_locked(uint8_t *tramp, bool freechunks) +{ + struct trampchunk *chunk; + int off; + + if (tramp == NULL) + return; + + TAILQ_FOREACH(chunk, &kinst_trampchunks, next) { + for (off = 0; off < KINST_TRAMPS_PER_CHUNK; off++) { + if (chunk->addr + off * KINST_TRAMP_SIZE == tramp) { + KINST_TRAMP_INIT(tramp, KINST_TRAMP_SIZE); + BIT_SET(KINST_TRAMPS_PER_CHUNK, off, + &chunk->free); + if (freechunks && + BIT_ISFULLSET(KINST_TRAMPS_PER_CHUNK, + &chunk->free)) + kinst_trampchunk_free(chunk); + return; + } + } + } + panic("%s: did not find trampoline chunk for %p", __func__, tramp); +} + +void +kinst_trampoline_dealloc(uint8_t *tramp) +{ + sx_xlock(&kinst_tramp_sx); + kinst_trampoline_dealloc_locked(tramp, true); + sx_xunlock(&kinst_tramp_sx); +} + +static void +kinst_thread_ctor(void *arg __unused, struct thread *td) +{ + td->t_kinst = kinst_trampoline_alloc(M_WAITOK); +} + +static void +kinst_thread_dtor(void *arg __unused, struct thread *td) +{ + void *tramp; + + tramp = td->t_kinst; + td->t_kinst = NULL; + + /* + * This assumes that the thread_dtor event permits sleeping, which + * appears to be true for the time being. 
+ */ + kinst_trampoline_dealloc(tramp); +} + +/* + * Register the thread constructor/destructor event handlers and hand a + * trampoline to every existing thread that does not have one yet. + * + * Returns 0 on success, or ENOMEM if a trampoline could not be allocated; in + * that case the handlers stay registered and already-assigned trampolines are + * left in place. + */ +int +kinst_trampoline_init(void) +{ + struct proc *p; + struct thread *td; + void *tramp; + int error; + + kinst_thread_ctor_handler = EVENTHANDLER_REGISTER(thread_ctor, + kinst_thread_ctor, NULL, EVENTHANDLER_PRI_ANY); + kinst_thread_dtor_handler = EVENTHANDLER_REGISTER(thread_dtor, + kinst_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); + + error = 0; + tramp = NULL; + + sx_slock(&allproc_lock); + sx_xlock(&kinst_tramp_sx); + FOREACH_PROC_IN_SYSTEM(p) { +retry: + PROC_LOCK(p); + FOREACH_THREAD_IN_PROC(p, td) { + if (td->t_kinst != NULL) + continue; + if (tramp == NULL) { + /* + * Try to allocate a trampoline without dropping + * the process lock. If all chunks are fully + * utilized, we must release the lock and try + * again. + */ + tramp = kinst_trampoline_alloc_locked(M_NOWAIT); + if (tramp == NULL) { + PROC_UNLOCK(p); + tramp = kinst_trampoline_alloc_locked( + M_WAITOK); + if (tramp == NULL) { + /* + * Let the unload handler clean + * up. + */ + error = ENOMEM; + goto out; + } else + goto retry; + } + } + td->t_kinst = tramp; + tramp = NULL; + } + PROC_UNLOCK(p); + } +out: + /* + * A trampoline allocated after dropping the process lock stays in + * "tramp" if the rescan finds no thread left to take it. Give any + * such leftover back to its chunk instead of leaking the slot; this + * is a no-op when "tramp" is NULL. + */ + kinst_trampoline_dealloc_locked(tramp, true); + sx_xunlock(&kinst_tramp_sx); + sx_sunlock(&allproc_lock); + return (error); +} + +/* + * Undo kinst_trampoline_init(): deregister the thread event handlers, release + * every thread's trampoline and free all trampoline chunks. Always returns 0. + */ +int +kinst_trampoline_deinit(void) +{ + struct trampchunk *chunk, *tmp; + struct proc *p; + struct thread *td; + + EVENTHANDLER_DEREGISTER(thread_ctor, kinst_thread_ctor_handler); + EVENTHANDLER_DEREGISTER(thread_dtor, kinst_thread_dtor_handler); + + sx_slock(&allproc_lock); + sx_xlock(&kinst_tramp_sx); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + FOREACH_THREAD_IN_PROC(p, td) { + /* Chunks are freed in bulk below, not one by one. */ + kinst_trampoline_dealloc_locked(td->t_kinst, false); + td->t_kinst = NULL; + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + TAILQ_FOREACH_SAFE(chunk, &kinst_trampchunks, next, tmp) + kinst_trampchunk_free(chunk); + sx_xunlock(&kinst_tramp_sx); + + return (0); +} diff --git a/sys/modules/dtrace/Makefile b/sys/modules/dtrace/Makefile --- a/sys/modules/dtrace/Makefile +++ 
b/sys/modules/dtrace/Makefile @@ -18,6 +18,7 @@ .endif .if ${MACHINE_CPUARCH} == "amd64" SUBDIR+= systrace_linux32 +SUBDIR+= kinst .endif .if ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "aarch64" || \ diff --git a/sys/modules/dtrace/kinst/Makefile b/sys/modules/dtrace/kinst/Makefile new file mode 100644 --- /dev/null +++ b/sys/modules/dtrace/kinst/Makefile @@ -0,0 +1,17 @@ +SYSDIR?= ${SRCTOP}/sys + +.PATH: ${SYSDIR}/cddl/dev/kinst \ + ${SYSDIR}/cddl/dev/kinst/${MACHINE_CPUARCH} + +KMOD= kinst +SRCS= kinst.c kinst_isa.c trampoline.c + +CFLAGS+= ${OPENZFS_CFLAGS} \ + -I${SYSDIR}/cddl/dev/kinst \ + -I${SYSDIR}/cddl/dev/dtrace/x86 \ + -I${SYSDIR}/cddl/dev/kinst/${MACHINE_CPUARCH} + +.include + +CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h +CWARNFLAGS+= ${OPENZFS_CWARNFLAGS}