Page MenuHomeFreeBSD

D36851.id.diff
No OneTemporary

D36851.id.diff

diff --git a/sys/cddl/dev/dtrace/dtrace_cddl.h b/sys/cddl/dev/dtrace/dtrace_cddl.h
--- a/sys/cddl/dev/dtrace/dtrace_cddl.h
+++ b/sys/cddl/dev/dtrace/dtrace_cddl.h
@@ -88,6 +88,7 @@
void *td_systrace_args; /* syscall probe arguments. */
uint64_t td_fasttrap_tp_gen; /* Tracepoint hash table gen. */
struct trapframe *td_dtrace_trapframe; /* Trap frame from invop. */
+ void *td_kinst;
} kdtrace_thread_t;
/*
@@ -117,6 +118,7 @@
#define t_dtrace_systrace_args td_dtrace->td_systrace_args
#define t_fasttrap_tp_gen td_dtrace->td_fasttrap_tp_gen
#define t_dtrace_trapframe td_dtrace->td_dtrace_trapframe
+#define t_kinst td_dtrace->td_kinst
#define p_dtrace_helpers p_dtrace->p_dtrace_helpers
#define p_dtrace_count p_dtrace->p_dtrace_count
#define p_dtrace_probes p_dtrace->p_dtrace_probes
diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.h b/sys/cddl/dev/kinst/amd64/kinst_isa.h
new file mode 100644
--- /dev/null
+++ b/sys/cddl/dev/kinst/amd64/kinst_isa.h
@@ -0,0 +1,45 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ * Copyright 2022 Mark Johnston <markj@FreeBSD.org>
+ */
+
+#ifndef _KINST_ISA_H_
+#define _KINST_ISA_H_
+
+#include <sys/types.h>
+
+#define KINST_PATCHVAL 0xcc
+
+/*
+ * Each trampoline is 32 bytes long and contains [instruction, jmp]. The copied
+ * instruction is at most 15 bytes long (the amd64 limit) and the jump back to
+ * the original code takes 14 bytes (a 6-byte "jmp *0(%rip)" followed by the
+ * 8-byte target address), so 32 bytes cover even the worst case scenario.
+ */
+#define KINST_TRAMP_SIZE 32
+#define KINST_TRAMPCHUNK_SIZE PAGE_SIZE
+
+/*
+ * Fill the trampolines with breakpoint instructions so that the kernel will
+ * crash cleanly if things somehow go wrong.
+ */
+#define KINST_TRAMP_INIT(t, s) memset((t), KINST_PATCHVAL, (s))
+
+typedef uint8_t kinst_patchval_t;
+
+/*
+ * Machine-dependent (amd64) per-probe state. It is filled in when the probed
+ * instruction is dissected and consulted each time the probe fires.
+ */
+struct kinst_probe_md {
+ int flags; /* KINST_F_* bits describing the instr */
+ int instlen; /* original instr len */
+ int tinstlen; /* trampoline instr len */
+ uint8_t template[16]; /* copied into thread tramps */
+ int dispoff; /* offset of rip displacement */
+
+ /* operands to "call" instruction branch target */
+ int reg1; /* first register operand, -1 if none */
+ int reg2; /* second (scaled) register, -1 if none */
+ int scale; /* left shift applied to reg2's value */
+ int64_t disp; /* sign-extended displacement; also used for rip-relative operands */
+};
+
+#endif /* _KINST_ISA_H_ */
diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.c b/sys/cddl/dev/kinst/amd64/kinst_isa.c
new file mode 100644
--- /dev/null
+++ b/sys/cddl/dev/kinst/amd64/kinst_isa.c
@@ -0,0 +1,550 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ * Copyright 2022 Mark Johnston <markj@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+
+#include <sys/dtrace.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <dis_tables.h>
+
+#include "kinst.h"
+
+#define KINST_PUSHL_RBP 0x55
+#define KINST_STI 0xfb
+#define KINST_POPF 0x9d
+
+#define KINST_MODRM_MOD(b) (((b) & 0xc0) >> 6)
+#define KINST_MODRM_REG(b) (((b) & 0x38) >> 3)
+#define KINST_MODRM_RM(b) ((b) & 0x07)
+
+#define KINST_SIB_SCALE(s) (((s) & 0xc0) >> 6)
+#define KINST_SIB_INDEX(s) (((s) & 0x38) >> 3)
+#define KINST_SIB_BASE(s) (((s) & 0x07) >> 0)
+
+#define KINST_REX_W(r) (((r) & 0x08) >> 3)
+#define KINST_REX_R(r) (((r) & 0x04) >> 2)
+#define KINST_REX_X(r) (((r) & 0x02) >> 1)
+#define KINST_REX_B(r) (((r) & 0x01) >> 0)
+
+#define KINST_F_CALL 0x0001 /* instruction is a "call" */
+#define KINST_F_DIRECT_CALL 0x0002 /* instruction is a direct call */
+#define KINST_F_RIPREL 0x0004 /* instruction is position-dependent */
+#define KINST_F_JMP 0x0008 /* instruction is a %rip-relative jmp */
+#define KINST_F_MOD_DIRECT 0x0010 /* operand is not a memory address */
+
+/*
+ * Map a register number to that register's slot in a struct trapframe. "reg"
+ * is a 3-bit ModR/M or SIB register field which the caller has extended to
+ * four bits with the matching REX bit. The value returned is an index in
+ * units of register_t rather than a byte offset, so that the trap frame can be
+ * indexed as an array of registers. Panics if "reg" is not in the range 0-15.
+ */
+static int
+kinst_regoff(int reg)
+{
+#define _MATCH_REG(i, reg) \
+ case i: \
+ return (offsetof(struct trapframe, tf_ ## reg) / \
+ sizeof(register_t))
+ switch (reg) {
+ _MATCH_REG( 0, rax);
+ _MATCH_REG( 1, rcx);
+ _MATCH_REG( 2, rdx);
+ _MATCH_REG( 3, rbx);
+ _MATCH_REG( 4, rsp); /* SIB when mod != 3 */
+ _MATCH_REG( 5, rbp);
+ _MATCH_REG( 6, rsi);
+ _MATCH_REG( 7, rdi);
+ _MATCH_REG( 8, r8); /* 8-15: a REX extension bit is set */
+ _MATCH_REG( 9, r9);
+ _MATCH_REG(10, r10);
+ _MATCH_REG(11, r11);
+ _MATCH_REG(12, r12);
+ _MATCH_REG(13, r13);
+ _MATCH_REG(14, r14);
+ _MATCH_REG(15, r15);
+ }
+#undef _MATCH_REG
+ panic("%s: unhandled register index %d", __func__, reg);
+}
+
+/*
+ * Fetch the value that the given register holds in the trap frame. A register
+ * number of -1 denotes an absent operand and yields zero.
+ */
+static uint64_t
+kinst_regval(struct trapframe *frame, int reg)
+{
+	register_t *regs;
+
+	if (reg != -1) {
+		/* View the trap frame as a flat array of registers. */
+		regs = (register_t *)frame;
+		return (regs[kinst_regoff(reg)]);
+	}
+	return (0);
+}
+
+/*
+ * Compute the 32-bit displacement to store in a copy of the probed instruction
+ * placed at "dst", i.e. the probe's recorded displacement rebased from the
+ * patch point to "dst". The result is truncated to 32 bits.
+ */
+static uint32_t
+kinst_riprel_disp(struct kinst_probe *kp, void *dst)
+{
+	intptr_t rebased, where;
+
+	rebased = (intptr_t)kp->kp_patchpoint + kp->kp_md.disp;
+	where = (intptr_t)dst;
+	return ((uint32_t)(rebased - where));
+}
+
+/*
+ * Fill the trampoline "tramp" for probe "kp": copy in the instruction
+ * template, patch the displacement of a position-dependent instruction so
+ * that it is relative to the trampoline, and append a jump back to the
+ * instruction that follows the probed one.
+ */
+static void
+kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
+{
+ uint8_t *instr;
+ uint32_t disp;
+ int ilen;
+
+ ilen = kp->kp_md.tinstlen;
+
+ memcpy(tramp, kp->kp_md.template, ilen);
+ if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
+ /* Rebase the displacement onto the trampoline's address. */
+ disp = kinst_riprel_disp(kp, tramp);
+ memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
+ }
+
+ /*
+ * The following position-independent jmp takes us back to the
+ * original code. It is encoded as "jmp *0(%rip)" (six bytes),
+ * followed by the absolute address of the instruction following
+ * the one that was traced (eight bytes).
+ */
+ tramp[ilen + 0] = 0xff;
+ tramp[ilen + 1] = 0x25;
+ tramp[ilen + 2] = 0x00;
+ tramp[ilen + 3] = 0x00;
+ tramp[ilen + 4] = 0x00;
+ tramp[ilen + 5] = 0x00;
+ instr = kp->kp_patchpoint + kp->kp_md.instlen;
+ memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
+}
+
+/*
+ * Breakpoint handler, registered with dtrace_invop_add().
+ *
+ * "addr" is compared against each probe's patch point to find the probe that
+ * trapped, "frame" is the trap frame and "scratch" is a stack slot in which
+ * the return address of an emulated call is stored.
+ *
+ * Returns 0 if no kinst probe is installed at "addr". Otherwise the probe is
+ * fired and the instruction is either emulated (calls, DTRACE_INVOP_CALL) or
+ * executed from the current thread's trampoline (DTRACE_INVOP_NOP).
+ */
+int
+kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
+{
+ solaris_cpu_t *cpu;
+ uintptr_t *stack, retaddr;
+ struct kinst_probe *kp;
+ struct kinst_probe_md *kpmd;
+ uint8_t *tramp;
+
+ stack = (uintptr_t *)frame->tf_rsp;
+ cpu = &solaris_cpu[curcpu];
+
+ /* Search the hash bucket for a probe patched in at this address. */
+ LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
+ if ((uintptr_t)kp->kp_patchpoint == addr)
+ break;
+ }
+ if (kp == NULL)
+ return (0);
+
+ /* Read the top of the interrupted stack with faults suppressed. */
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ cpu->cpu_dtrace_caller = stack[0];
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+ dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
+ cpu->cpu_dtrace_caller = 0;
+
+ kpmd = &kp->kp_md;
+ if ((kpmd->flags & KINST_F_CALL) != 0) {
+ /*
+ * dtrace_invop_start() reserves space on the stack to
+ * store the return address of the call instruction.
+ */
+ retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
+ *(uintptr_t *)scratch = retaddr;
+
+ if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
+ /* The target is relative to the end of the call. */
+ frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
+ kpmd->disp + kpmd->instlen);
+ } else {
+ register_t rval;
+
+ if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
+ /* rip-relative */
+ /*
+ * NOTE(review): assumes tf_rip points one byte
+ * past the patch point here - confirm against
+ * dtrace_invop_start().
+ */
+ rval = frame->tf_rip - 1 + kpmd->instlen;
+ } else {
+ /* indirect */
+ rval = kinst_regval(frame, kpmd->reg1) +
+ (kinst_regval(frame, kpmd->reg2) <<
+ kpmd->scale);
+ }
+
+ if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
+ /* Register operand: the value is the target. */
+ frame->tf_rip = rval + kpmd->disp;
+ } else {
+ /* Memory operand: load the target from memory. */
+ frame->tf_rip =
+ *(uintptr_t *)(rval + kpmd->disp);
+ }
+ }
+ return (DTRACE_INVOP_CALL);
+ } else {
+ tramp = curthread->t_kinst;
+ if (tramp == NULL) {
+ /*
+ * A trampoline allocation failed, so this probe is
+ * effectively disabled. Restore the original
+ * instruction.
+ *
+ * We can't safely print anything here, but the
+ * trampoline allocator should have left a breadcrumb in
+ * the dmesg.
+ */
+ kinst_patch_tracepoint(kp, kp->kp_savedval);
+ frame->tf_rip = (register_t)kp->kp_patchpoint;
+ } else {
+ /* Resume execution in the thread's trampoline. */
+ kinst_trampoline_populate(kp, tramp);
+ frame->tf_rip = (register_t)tramp;
+ }
+ return (DTRACE_INVOP_NOP);
+ }
+}
+
+/*
+ * Store "val" at the probe's patch point. The store is bracketed by
+ * intr_disable()/intr_restore() and disable_wp()/restore_wp().
+ * NOTE(review): presumably the latter lifts write protection so that kernel
+ * text can be modified - confirm against the amd64 implementation.
+ */
+void
+kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
+{
+ register_t reg;
+ int oldwp;
+
+ reg = intr_disable();
+ oldwp = disable_wp();
+ *kp->kp_patchpoint = val;
+ restore_wp(oldwp);
+ intr_restore(reg);
+}
+
+/*
+ * Record an 8-bit displacement, sign-extended to 64 bits, in the probe.
+ */
+static void
+kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
+{
+	int8_t sdisp;
+
+	sdisp = (int8_t)byte;
+	kp->kp_md.disp = (int64_t)sdisp;
+}
+
+/*
+ * Record a 32-bit displacement, read from the (possibly unaligned) instruction
+ * bytes and sign-extended to 64 bits, in the probe.
+ */
+static void
+kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
+{
+	int32_t raw;
+	int64_t wide;
+
+	memcpy(&raw, bytes, sizeof(raw));
+	wide = raw;
+	kp->kp_md.disp = wide;
+}
+
+/*
+ * Byte-fetch callback for the disassembler: return the byte under the cursor
+ * that "p" points to, and advance the cursor by one.
+ */
+static int
+kinst_dis_get_byte(void *p)
+{
+	uint8_t **cursor;
+	uint8_t byte;
+
+	cursor = p;
+	byte = **cursor;
+	*cursor += 1;
+
+	return (byte);
+}
+
+/*
+ * Set up all of the state needed to faithfully execute a probed instruction.
+ *
+ * In the simple case, we copy the instruction unmodified to a per-thread
+ * trampoline, wherein it is followed by a jump back to the original code.
+ * - Instructions can have %rip as an operand:
+ *   - with %rip-relative addressing encoded in ModR/M, or
+ *   - implicitly as a part of the instruction definition (jmp, call).
+ * - Call instructions (which may be %rip-relative) need to push the correct
+ *   return address onto the stack.
+ *
+ * Call instructions are simple enough to be emulated in software, so we simply
+ * do not use the trampoline mechanism in that case.  kinst_invop() will compute
+ * the branch target using the address info computed here (register operands and
+ * displacement).
+ *
+ * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
+ * displacement; when populating the trampoline the displacement is adjusted to
+ * be relative to the trampoline address.  Trampolines are always allocated
+ * above KERNBASE for this reason.
+ *
+ * For other %rip-relative operands (just jumps) we take the same approach.
+ * Instructions which specify an 8-bit displacement must be rewritten to use a
+ * 32-bit displacement.
+ *
+ * On entry "*instr" points at the instruction to dissect; the disassembler
+ * advances it as it consumes bytes.  Returns 0 on success, or EINVAL if the
+ * instruction cannot be disassembled or is one that we cannot instrument.
+ */
+static int
+kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
+{
+	struct kinst_probe_md *kpmd;
+	dis86_t d86;
+	uint8_t *bytes, *start, modrm, rex;
+	int dispoff, i, ilen, opcidx;
+
+	kpmd = &kp->kp_md;
+
+	/* Remember where we started: the disassembler moves "*instr". */
+	start = *instr;
+
+	d86.d86_data = instr;
+	d86.d86_get_byte = kinst_dis_get_byte;
+	d86.d86_check_func = NULL;
+	if (dtrace_disx86(&d86, SIZE64) != 0) {
+		KINST_LOG("failed to disassemble instruction at: %p", start);
+		return (EINVAL);
+	}
+	bytes = d86.d86_bytes;
+	kpmd->instlen = kpmd->tinstlen = d86.d86_len;
+
+	/*
+	 * Skip over prefixes, save REX.
+	 */
+	rex = 0;
+	for (i = 0; i < kpmd->instlen; i++) {
+		switch (bytes[i]) {
+		case 0xf0 ... 0xf3:
+			/* group 1 */
+			continue;
+		case 0x26:
+		case 0x2e:
+		case 0x36:
+		case 0x3e:
+		case 0x64:
+		case 0x65:
+			/* group 2 */
+			continue;
+		case 0x66:
+			/* group 3 */
+			continue;
+		case 0x67:
+			/* group 4 */
+			continue;
+		case 0x40 ... 0x4f:
+			/* REX */
+			rex = bytes[i];
+			continue;
+		}
+		break;
+	}
+	KASSERT(i < kpmd->instlen,
+	    ("%s: failed to disassemble instruction at %p", __func__, bytes));
+	opcidx = i;
+
+	/*
+	 * Identify instructions of interest by opcode: calls and jumps.
+	 * Extract displacements.
+	 */
+	dispoff = -1;
+	switch (bytes[opcidx]) {
+	case 0x0f:
+		switch (bytes[opcidx + 1]) {
+		case 0x80 ... 0x8f:
+			/* conditional jmp near */
+			kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+			dispoff = opcidx + 2;
+			kinst_set_disp32(kp, &bytes[dispoff]);
+			break;
+		}
+		break;
+	case 0xe3:
+		/*
+		 * There is no straightforward way to translate this instruction
+		 * to use a 32-bit displacement.  Fortunately, it is rarely
+		 * used.
+		 */
+		return (EINVAL);
+	case 0x70 ... 0x7f:
+		/* conditional jmp short */
+		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+		dispoff = opcidx + 1;
+		kinst_set_disp8(kp, bytes[dispoff]);
+		break;
+	case 0xe9:
+		/* unconditional jmp near */
+		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+		dispoff = opcidx + 1;
+		kinst_set_disp32(kp, &bytes[dispoff]);
+		break;
+	case 0xeb:
+		/* unconditional jmp short */
+		kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+		dispoff = opcidx + 1;
+		kinst_set_disp8(kp, bytes[dispoff]);
+		break;
+	case 0xe8:
+	case 0x9a:
+		/* direct call */
+		kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
+		dispoff = opcidx + 1;
+		kinst_set_disp32(kp, &bytes[dispoff]);
+		break;
+	case 0xff:
+		KASSERT(d86.d86_got_modrm,
+		    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
+		switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
+		case 0x02:
+		case 0x03:
+			/* indirect call */
+			kpmd->flags |= KINST_F_CALL;
+			break;
+		case 0x04:
+		case 0x05:
+			/* indirect jump */
+			kpmd->flags |= KINST_F_JMP;
+			break;
+		}
+		break;
+	}
+
+	/*
+	 * If there's a ModR/M byte, we need to check it to see if the operand
+	 * is %rip-relative, and rewrite the displacement if so.  If not, we
+	 * might still have to extract operand info if this is a call
+	 * instruction.
+	 */
+	if (d86.d86_got_modrm) {
+		uint8_t mod, rm, sib;
+
+		kpmd->reg1 = kpmd->reg2 = -1;
+
+		modrm = bytes[d86.d86_rmindex];
+		mod = KINST_MODRM_MOD(modrm);
+		rm = KINST_MODRM_RM(modrm);
+		if (mod == 0 && rm == 5) {
+			kpmd->flags |= KINST_F_RIPREL;
+			dispoff = d86.d86_rmindex + 1;
+			kinst_set_disp32(kp, &bytes[dispoff]);
+		} else if ((kpmd->flags & KINST_F_CALL) != 0) {
+			bool havesib;
+
+			havesib = (mod != 3 && rm == 4);
+			dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
+			if (mod == 1)
+				kinst_set_disp8(kp, bytes[dispoff]);
+			else if (mod == 2)
+				kinst_set_disp32(kp, &bytes[dispoff]);
+			else if (mod == 3)
+				kpmd->flags |= KINST_F_MOD_DIRECT;
+
+			if (havesib) {
+				sib = bytes[d86.d86_rmindex + 1];
+				if (KINST_SIB_BASE(sib) != 5 || mod != 0) {
+					kpmd->reg1 = KINST_SIB_BASE(sib) |
+					    (KINST_REX_B(rex) << 3);
+				} else {
+					/*
+					 * A base of 101b with mod == 0 means
+					 * that there is no base register and
+					 * that a 32-bit displacement follows
+					 * the SIB byte.  With any other mod,
+					 * the base really is %rbp or %r13.
+					 */
+					kinst_set_disp32(kp, &bytes[dispoff]);
+				}
+				kpmd->scale = KINST_SIB_SCALE(sib);
+				/*
+				 * An index of 100b without REX.X means "no
+				 * index register", not %rsp.
+				 */
+				if (KINST_SIB_INDEX(sib) != 4 ||
+				    KINST_REX_X(rex) != 0) {
+					kpmd->reg2 = KINST_SIB_INDEX(sib) |
+					    (KINST_REX_X(rex) << 3);
+				}
+				if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
+					/*
+					 * An absolute 32-bit address with no
+					 * registers.  kinst_invop() treats a
+					 * call without register operands as
+					 * %rip-relative, so this rare form
+					 * cannot be emulated correctly.
+					 */
+					return (EINVAL);
+				}
+			} else {
+				kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
+			}
+		}
+	}
+
+	/*
+	 * Calls are emulated in software; once operands are decoded we have
+	 * nothing else to do.
+	 */
+	if ((kpmd->flags & KINST_F_CALL) != 0)
+		return (0);
+
+	/*
+	 * Allocate and populate an instruction trampoline template.
+	 *
+	 * Position-independent instructions can simply be copied, but
+	 * position-dependent instructions require some surgery: jump
+	 * instructions with an 8-bit displacement need to be converted to use a
+	 * 32-bit displacement, and the adjusted displacement needs to be
+	 * computed.
+	 */
+	ilen = kpmd->instlen;
+	if ((kpmd->flags & KINST_F_RIPREL) != 0) {
+		if ((kpmd->flags & KINST_F_JMP) == 0 ||
+		    bytes[opcidx] == 0x0f ||
+		    bytes[opcidx] == 0xe9 ||
+		    bytes[opcidx] == 0xff) {
+			/*
+			 * Copy everything but the 32-bit displacement, which
+			 * is filled in when the trampoline is populated.
+			 */
+			memcpy(kpmd->template, bytes, dispoff);
+			memcpy(&kpmd->template[dispoff + 4],
+			    &bytes[dispoff + 4], ilen - (dispoff + 4));
+			kpmd->dispoff = dispoff;
+		} else if (bytes[opcidx] == 0xeb) {
+			memcpy(kpmd->template, bytes, opcidx);
+			kpmd->template[opcidx] = 0xe9;
+			kpmd->dispoff = opcidx + 1;
+
+			/* Instruction length changes from 2 to 5. */
+			kpmd->tinstlen = 5;
+			kpmd->disp -= 3;
+		} else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f) {
+			memcpy(kpmd->template, bytes, opcidx);
+			kpmd->template[opcidx] = 0x0f;
+			kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
+			kpmd->dispoff = opcidx + 2;
+
+			/* Instruction length changes from 2 to 6. */
+			kpmd->tinstlen = 6;
+			kpmd->disp -= 4;
+		} else {
+			panic("unhandled opcode %#x", bytes[opcidx]);
+		}
+	} else {
+		memcpy(kpmd->template, bytes, ilen);
+	}
+
+	return (0);
+}
+
+/*
+ * Callback for linker_file_function_listall(): create kinst probes for the
+ * function described by "symval" if it is the one named in the probe
+ * description passed via "opaque".
+ *
+ * One probe is created per instruction, or only for the instruction at offset
+ * kpd_off when that is not -1.  Returns 0 if the function was skipped or its
+ * probes were created, ENOMEM if more than KINST_PROBETAB_MAX probes would be
+ * needed, or the error returned by kinst_instr_dissect().
+ */
+int
+kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
+    void *opaque)
+{
+	struct kinst_probe *kp;
+	dtrace_kinst_probedesc_t *pd;
+	const char *func;
+	int error, n, off;
+	uint8_t *instr, *limit;
+
+	pd = opaque;
+	func = symval->name;
+	if (strcmp(func, pd->kpd_func) != 0 || strcmp(func, "trap_check") == 0)
+		return (0);
+
+	instr = (uint8_t *)symval->value;
+	limit = (uint8_t *)symval->value + symval->size;
+	if (instr >= limit)
+		return (0);
+
+	/*
+	 * Ignore functions not beginning with the usual function prologue.
+	 * These might correspond to assembly routines with which we should not
+	 * meddle.
+	 */
+	if (*instr != KINST_PUSHL_RBP)
+		return (0);
+
+	n = 0;
+	while (instr < limit) {
+		off = (int)(instr - (uint8_t *)symval->value);
+		if (pd->kpd_off != -1 && off != pd->kpd_off) {
+			/* Not the requested instruction, step over it. */
+			instr += dtrace_instr_size(instr);
+			continue;
+		}
+
+		/*
+		 * Prevent separate dtrace(1) instances from creating copies of
+		 * the same probe.
+		 */
+		LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
+			if (strcmp(kp->kp_func, func) == 0 &&
+			    strtol(kp->kp_name, NULL, 10) == off)
+				return (0);
+		}
+		if (++n > KINST_PROBETAB_MAX) {
+			KINST_LOG("probe list full: %d entries", n);
+			return (ENOMEM);
+		}
+		kp = malloc(sizeof(struct kinst_probe), M_KINST,
+		    M_WAITOK | M_ZERO);
+		kp->kp_func = func;
+		snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
+		kp->kp_savedval = *instr;
+		kp->kp_patchval = KINST_PATCHVAL;
+		kp->kp_patchpoint = instr;
+
+		/* On success this also advances "instr" past the instruction. */
+		error = kinst_instr_dissect(kp, &instr);
+		if (error != 0) {
+			/*
+			 * The probe was never handed to DTrace or linked into
+			 * the hash table, so nobody else will free it.
+			 */
+			free(kp, M_KINST);
+			return (error);
+		}
+
+		kinst_probe_create(kp, lf);
+	}
+
+	return (0);
+}
diff --git a/sys/cddl/dev/kinst/kinst.h b/sys/cddl/dev/kinst/kinst.h
new file mode 100644
--- /dev/null
+++ b/sys/cddl/dev/kinst/kinst.h
@@ -0,0 +1,71 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ */
+
+#ifndef _KINST_H_
+#define _KINST_H_
+
+#include <sys/dtrace.h>
+
+/*
+ * Probe description passed with the KINSTIOC_MAKEPROBE ioctl.
+ */
+typedef struct {
+ char kpd_func[DTRACE_FUNCNAMELEN]; /* function to instrument */
+ char kpd_mod[DTRACE_MODNAMELEN]; /* module name, empty for any */
+ int kpd_off; /* instruction offset, -1 for all */
+} dtrace_kinst_probedesc_t;
+
+#define KINSTIOC_MAKEPROBE _IOW('k', 1, dtrace_kinst_probedesc_t)
+
+#ifdef _KERNEL
+
+#include <sys/queue.h>
+
+#include "kinst_isa.h"
+
+/*
+ * A single kinst probe, i.e. one instrumented instruction. Probes are kept in
+ * the kinst_probetab hash table, keyed by the address of the patch point.
+ */
+struct kinst_probe {
+ LIST_ENTRY(kinst_probe) kp_hashnext; /* hash bucket linkage */
+ const char *kp_func; /* name of the enclosing function */
+ char kp_name[16]; /* instruction offset, in decimal */
+ dtrace_id_t kp_id; /* id from dtrace_probe_create() */
+ kinst_patchval_t kp_patchval; /* value written to enable the probe */
+ kinst_patchval_t kp_savedval; /* original value at the patch point */
+ kinst_patchval_t *kp_patchpoint; /* address that gets patched */
+
+ struct kinst_probe_md kp_md; /* machine-dependent state */
+};
+
+LIST_HEAD(kinst_probe_list, kinst_probe);
+
+extern struct kinst_probe_list *kinst_probetab;
+
+#define KINST_PROBETAB_MAX 0x8000 /* 32k */
+#define KINST_ADDR2NDX(addr) (((uintptr_t)(addr)) & (KINST_PROBETAB_MAX - 1))
+#define KINST_GETPROBE(i) (&kinst_probetab[KINST_ADDR2NDX(i)])
+
+struct linker_file;
+struct linker_symval;
+
+int kinst_invop(uintptr_t, struct trapframe *, uintptr_t);
+int kinst_make_probe(struct linker_file *, int, struct linker_symval *,
+ void *);
+void kinst_patch_tracepoint(struct kinst_probe *, kinst_patchval_t);
+void kinst_probe_create(struct kinst_probe *, struct linker_file *);
+
+int kinst_trampoline_init(void);
+int kinst_trampoline_deinit(void);
+uint8_t *kinst_trampoline_alloc(int);
+void kinst_trampoline_dealloc(uint8_t *);
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_KINST);
+#endif /* MALLOC_DECLARE */
+
+#define KINST_LOG_HELPER(fmt, ...) \
+ printf("%s:%d: " fmt "%s\n", __func__, __LINE__, __VA_ARGS__)
+#define KINST_LOG(...) \
+ KINST_LOG_HELPER(__VA_ARGS__, "")
+
+#endif /* _KERNEL */
+
+#endif /* _KINST_H_ */
diff --git a/sys/cddl/dev/kinst/kinst.c b/sys/cddl/dev/kinst/kinst.c
new file mode 100644
--- /dev/null
+++ b/sys/cddl/dev/kinst/kinst.c
@@ -0,0 +1,233 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+
+#include <sys/dtrace.h>
+
+#include "kinst.h"
+
+MALLOC_DEFINE(M_KINST, "kinst", "Kernel Instruction Tracing");
+
+static d_open_t kinst_open;
+static d_close_t kinst_close;
+static d_ioctl_t kinst_ioctl;
+
+static void kinst_provide_module(void *, modctl_t *);
+static void kinst_getargdesc(void *, dtrace_id_t, void *,
+ dtrace_argdesc_t *);
+static void kinst_destroy(void *, dtrace_id_t, void *);
+static void kinst_enable(void *, dtrace_id_t, void *);
+static void kinst_disable(void *, dtrace_id_t, void *);
+static int kinst_load(void *);
+static int kinst_unload(void *);
+static int kinst_modevent(module_t, int, void *);
+
+static dtrace_pattr_t kinst_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+};
+
+static dtrace_pops_t kinst_pops = {
+ .dtps_provide = NULL,
+ .dtps_provide_module = kinst_provide_module,
+ .dtps_enable = kinst_enable,
+ .dtps_disable = kinst_disable,
+ .dtps_suspend = NULL,
+ .dtps_resume = NULL,
+ .dtps_getargdesc = kinst_getargdesc,
+ .dtps_getargval = NULL,
+ .dtps_usermode = NULL,
+ .dtps_destroy = kinst_destroy
+};
+
+static struct cdevsw kinst_cdevsw = {
+ .d_name = "kinst",
+ .d_version = D_VERSION,
+ .d_flags = D_TRACKCLOSE,
+ .d_open = kinst_open,
+ .d_close = kinst_close,
+ .d_ioctl = kinst_ioctl,
+};
+
+static dtrace_provider_id_t kinst_id;
+struct kinst_probe_list *kinst_probetab;
+static struct cdev *kinst_cdev;
+
+/*
+ * Register the probe with DTrace, using the linker file's name as the module
+ * name, and link it into the hash bucket for its patch point.
+ */
+void
+kinst_probe_create(struct kinst_probe *kp, linker_file_t lf)
+{
+	struct kinst_probe_list *bucket;
+
+	kp->kp_id = dtrace_probe_create(kinst_id, lf->filename,
+	    kp->kp_func, kp->kp_name, 3, kp);
+
+	bucket = KINST_GETPROBE(kp->kp_patchpoint);
+	LIST_INSERT_HEAD(bucket, kp, kp_hashnext);
+}
+
+/*
+ * Open handler for the kinst device. There is no per-open state, so this
+ * always succeeds.
+ */
+static int
+kinst_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused,
+ struct thread *td __unused)
+{
+ return (0);
+}
+
+/*
+ * Close handler for the kinst device. Calls dtrace_condense() on the kinst
+ * provider and always succeeds.
+ * NOTE(review): presumably this discards probes that are no longer in use -
+ * confirm against dtrace_condense().
+ */
+static int
+kinst_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused,
+ struct thread *td __unused)
+{
+ dtrace_condense(kinst_id);
+ return (0);
+}
+
+/*
+ * Callback for linker_file_foreach(): create probes in the linker file "lf"
+ * if it matches the module named in the probe description "arg". An empty
+ * module name matches every file.
+ */
+static int
+kinst_linker_file_cb(linker_file_t lf, void *arg)
+{
+ dtrace_kinst_probedesc_t *pd;
+
+ pd = arg;
+ if (pd->kpd_mod[0] != '\0' && strcmp(pd->kpd_mod, lf->filename) != 0)
+ return (0);
+
+ /*
+ * Invoke kinst_make_probe() once for each function symbol in the
+ * module "lf".
+ */
+ return (linker_file_function_listall(lf, kinst_make_probe, arg));
+}
+
+/*
+ * Ioctl handler for the kinst device.  KINSTIOC_MAKEPROBE creates the probes
+ * matching the supplied description; any other command fails with ENOTTY.
+ */
+static int
+kinst_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr,
+    int flags __unused, struct thread *td __unused)
+{
+	dtrace_kinst_probedesc_t *pd;
+
+	if (cmd != KINSTIOC_MAKEPROBE)
+		return (ENOTTY);
+
+	/* Make sure that the names supplied by the caller are terminated. */
+	pd = (dtrace_kinst_probedesc_t *)addr;
+	pd->kpd_func[sizeof(pd->kpd_func) - 1] = '\0';
+	pd->kpd_mod[sizeof(pd->kpd_mod) - 1] = '\0';
+
+	/* Loop over all functions in the kernel and loaded modules. */
+	return (linker_file_foreach(kinst_linker_file_cb, pd));
+}
+
+/*
+ * dtps_provide_module hook. Intentionally empty: kinst probes are only
+ * created on request, through the KINSTIOC_MAKEPROBE ioctl.
+ */
+static void
+kinst_provide_module(void *arg, modctl_t *lf)
+{
+}
+
+/*
+ * dtps_getargdesc hook. Reports DTRACE_ARGNONE for every argument index.
+ */
+static void
+kinst_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
+{
+ desc->dtargd_ndx = DTRACE_ARGNONE;
+}
+
+/*
+ * dtps_destroy hook. Unlink the probe "parg" from its hash bucket and free
+ * it.
+ */
+static void
+kinst_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+ struct kinst_probe *kp = parg;
+
+ LIST_REMOVE(kp, kp_hashnext);
+ free(kp, M_KINST);
+}
+
+/*
+ * dtps_enable hook. Arm the probe "parg" by writing its patch value (a
+ * breakpoint) over the first byte of the probed instruction.
+ */
+static void
+kinst_enable(void *arg, dtrace_id_t id, void *parg)
+{
+ struct kinst_probe *kp = parg;
+
+ kinst_patch_tracepoint(kp, kp->kp_patchval);
+}
+
+/*
+ * dtps_disable hook. Disarm the probe "parg" by writing back the byte that
+ * was saved from the patch point when the probe was created.
+ */
+static void
+kinst_disable(void *arg, dtrace_id_t id, void *parg)
+{
+ struct kinst_probe *kp = parg;
+
+ kinst_patch_tracepoint(kp, kp->kp_savedval);
+}
+
+/*
+ * Provider initialization, run from SYSINIT: set up the per-thread
+ * trampolines, register the provider with DTrace, allocate the probe hash
+ * table, create the dtrace/kinst device node and install the breakpoint
+ * handler. Returns 0 on success, or the error from trampoline setup or
+ * provider registration.
+ */
+static int
+kinst_load(void *dummy)
+{
+ int error;
+
+ error = kinst_trampoline_init();
+ if (error != 0)
+ return (error);
+
+ error = dtrace_register("kinst", &kinst_attr, DTRACE_PRIV_USER, NULL,
+ &kinst_pops, NULL, &kinst_id);
+ if (error != 0) {
+ /* Undo the trampoline setup done above. */
+ kinst_trampoline_deinit();
+ return (error);
+ }
+ kinst_probetab = malloc(KINST_PROBETAB_MAX *
+ sizeof(struct kinst_probe_list), M_KINST, M_WAITOK | M_ZERO);
+ for (int i = 0; i < KINST_PROBETAB_MAX; i++)
+ LIST_INIT(&kinst_probetab[i]);
+ kinst_cdev = make_dev(&kinst_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "dtrace/kinst");
+ dtrace_invop_add(kinst_invop);
+ return (0);
+}
+
+/*
+ * Provider teardown, run from SYSUNINIT.  Returns the result of
+ * dtrace_unregister().
+ *
+ * The probe hash table must outlive both the breakpoint handler and the
+ * provider registration: kinst_invop() looks probes up in it, and
+ * dtrace_unregister() destroys the remaining probes through kinst_destroy(),
+ * which unlinks each of them from its hash bucket.  So the table is freed
+ * last.
+ */
+static int
+kinst_unload(void *dummy)
+{
+	int error;
+
+	kinst_trampoline_deinit();
+	dtrace_invop_remove(kinst_invop);
+	destroy_dev(kinst_cdev);
+
+	error = dtrace_unregister(kinst_id);
+	free(kinst_probetab, M_KINST);
+
+	return (error);
+}
+
+/*
+ * Module event handler.  Loading only prints a warning, unloading and
+ * shutdown need no work here, and every other event is rejected with
+ * EOPNOTSUPP.
+ */
+static int
+kinst_modevent(module_t mod __unused, int type, void *data __unused)
+{
+	if (type == MOD_LOAD) {
+		KINST_LOG(
+		    "kinst: This provider is experimental, exercise caution");
+		return (0);
+	}
+	if (type == MOD_UNLOAD || type == MOD_SHUTDOWN)
+		return (0);
+
+	return (EOPNOTSUPP);
+}
+
+SYSINIT(kinst_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_load, NULL);
+SYSUNINIT(kinst_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_unload,
+ NULL);
+
+DEV_MODULE(kinst, kinst_modevent, NULL);
+MODULE_VERSION(kinst, 1);
+MODULE_DEPEND(kinst, dtrace, 1, 1, 1);
+MODULE_DEPEND(kinst, opensolaris, 1, 1, 1);
diff --git a/sys/cddl/dev/kinst/trampoline.c b/sys/cddl/dev/kinst/trampoline.c
new file mode 100644
--- /dev/null
+++ b/sys/cddl/dev/kinst/trampoline.c
@@ -0,0 +1,303 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <christos@FreeBSD.org>
+ * Copyright 2022 Mark Johnston <markj@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bitset.h>
+#include <sys/cred.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sx.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+
+#include "kinst.h"
+#include "kinst_isa.h"
+
+/*
+ * We can have 4KB/32B = 128 trampolines per chunk.
+ */
+#define KINST_TRAMPS_PER_CHUNK (KINST_TRAMPCHUNK_SIZE / KINST_TRAMP_SIZE)
+/*
+ * Set the object size to 2GB, since we know that the object will only ever be
+ * used to allocate pages in the range [KERNBASE, 0xfffffffffffff000].
+ */
+#define KINST_VMOBJ_SIZE (VM_MAX_ADDRESS - KERNBASE)
+
+/*
+ * Tracks one chunk of KINST_TRAMPCHUNK_SIZE bytes, which is carved up into
+ * KINST_TRAMPS_PER_CHUNK trampolines.
+ */
+struct trampchunk {
+ TAILQ_ENTRY(trampchunk) next; /* linkage on kinst_trampchunks */
+ uint8_t *addr; /* address of the chunk's first trampoline */
+ /* 0 -> allocated, 1 -> free */
+ BITSET_DEFINE(, KINST_TRAMPS_PER_CHUNK) free;
+};
+
+static TAILQ_HEAD(, trampchunk) kinst_trampchunks =
+ TAILQ_HEAD_INITIALIZER(kinst_trampchunks);
+static struct sx kinst_tramp_sx;
+SX_SYSINIT(kinst_tramp_sx, &kinst_tramp_sx, "kinst tramp");
+static eventhandler_tag kinst_thread_ctor_handler;
+static eventhandler_tag kinst_thread_dtor_handler;
+
+/*
+ * Allocate a new trampoline chunk: reserve address space in the kernel map at
+ * or above KERNBASE, back it with executable memory, fill it with breakpoint
+ * instructions and put it on the chunk list with every trampoline marked
+ * free. Returns NULL if no address space could be reserved.
+ *
+ * The caller must hold kinst_tramp_sx exclusively.
+ */
+static struct trampchunk *
+kinst_trampchunk_alloc(void)
+{
+ struct trampchunk *chunk;
+ vm_offset_t trampaddr;
+ int error __diagused;
+
+ sx_assert(&kinst_tramp_sx, SX_XLOCKED);
+
+ /*
+ * Allocate virtual memory for the trampoline chunk. The returned
+ * address is saved in "trampaddr".
+ *
+ * Setting "trampaddr" to KERNBASE causes vm_map_find() to return an
+ * address above KERNBASE, so this satisfies both requirements.
+ */
+ trampaddr = KERNBASE;
+ error = vm_map_find(kernel_map, NULL, 0, &trampaddr,
+ KINST_TRAMPCHUNK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
+ 0);
+ if (error != KERN_SUCCESS) {
+ KINST_LOG("trampoline chunk allocation failed: %d", error);
+ return (NULL);
+ }
+
+ /* Back the reserved range with executable memory. */
+ error = kmem_back(kernel_object, trampaddr, KINST_TRAMPCHUNK_SIZE,
+ M_WAITOK | M_EXEC);
+ KASSERT(error == KERN_SUCCESS, ("kmem_back failed: %d", error));
+
+ KINST_TRAMP_INIT((void *)trampaddr, KINST_TRAMPCHUNK_SIZE);
+
+ /* Allocate a tracker for this chunk. */
+ chunk = malloc(sizeof(*chunk), M_KINST, M_WAITOK);
+ chunk->addr = (void *)trampaddr;
+ BIT_FILL(KINST_TRAMPS_PER_CHUNK, &chunk->free);
+
+ TAILQ_INSERT_HEAD(&kinst_trampchunks, chunk, next);
+
+ return (chunk);
+}
+
+/*
+ * Take "chunk" off the chunk list, release its backing memory and its kernel
+ * map entry, and free the tracker itself.
+ *
+ * The caller must hold kinst_tramp_sx exclusively.
+ */
+static void
+kinst_trampchunk_free(struct trampchunk *chunk)
+{
+ sx_assert(&kinst_tramp_sx, SX_XLOCKED);
+
+ TAILQ_REMOVE(&kinst_trampchunks, chunk, next);
+ kmem_unback(kernel_object, (vm_offset_t)chunk->addr,
+ KINST_TRAMPCHUNK_SIZE);
+ (void)vm_map_remove(kernel_map, (vm_offset_t)chunk->addr,
+ (vm_offset_t)(chunk->addr + KINST_TRAMPCHUNK_SIZE));
+ free(chunk, M_KINST);
+}
+
+/*
+ * Allocate one trampoline. A free slot in an existing chunk is preferred. If
+ * there is none, a new chunk is allocated unless "how" contains M_NOWAIT.
+ * Returns the trampoline's address, or NULL if none could be allocated.
+ *
+ * The caller must hold kinst_tramp_sx exclusively.
+ */
+static uint8_t *
+kinst_trampoline_alloc_locked(int how)
+{
+ struct trampchunk *chunk;
+ uint8_t *tramp;
+ int off;
+
+ sx_assert(&kinst_tramp_sx, SX_XLOCKED);
+
+ TAILQ_FOREACH(chunk, &kinst_trampchunks, next) {
+ /* All trampolines from this chunk are already allocated. */
+ if ((off = BIT_FFS(KINST_TRAMPS_PER_CHUNK, &chunk->free)) == 0)
+ continue;
+ /* BIT_FFS() returns indices starting at 1 instead of 0. */
+ off--;
+ break;
+ }
+ if (chunk == NULL) {
+ if ((how & M_NOWAIT) != 0)
+ return (NULL);
+
+ /*
+ * We didn't find any free trampoline in the current list, so
+ * allocate a new chunk. If that fails, the provider will no
+ * longer be reliable, so try to warn the user.
+ */
+ if ((chunk = kinst_trampchunk_alloc()) == NULL) {
+ static bool once = true;
+
+ if (once) {
+ once = false;
+ KINST_LOG(
+ "kinst: failed to allocate trampoline, "
+ "probes may not fire");
+ }
+ return (NULL);
+ }
+ /* Every slot of a fresh chunk is free, take the first one. */
+ off = 0;
+ }
+ BIT_CLR(KINST_TRAMPS_PER_CHUNK, off, &chunk->free);
+ tramp = chunk->addr + off * KINST_TRAMP_SIZE;
+ return (tramp);
+}
+
+/*
+ * Allocate one trampoline, taking kinst_tramp_sx around
+ * kinst_trampoline_alloc_locked(). Returns NULL on failure.
+ */
+uint8_t *
+kinst_trampoline_alloc(int how)
+{
+ uint8_t *tramp;
+
+ sx_xlock(&kinst_tramp_sx);
+ tramp = kinst_trampoline_alloc_locked(how);
+ sx_xunlock(&kinst_tramp_sx);
+ return (tramp);
+}
+
+/*
+ * Return the trampoline "tramp" to its chunk: refill it with breakpoint
+ * instructions and mark its slot free.  If "freechunks" is set and the chunk
+ * has no allocated trampolines left, the chunk is released as well.  A NULL
+ * "tramp" is ignored; a trampoline that belongs to no chunk causes a panic.
+ */
+static void
+kinst_trampoline_dealloc_locked(uint8_t *tramp, bool freechunks)
+{
+	struct trampchunk *chunk;
+	int slot;
+
+	if (tramp == NULL)
+		return;
+
+	TAILQ_FOREACH(chunk, &kinst_trampchunks, next) {
+		for (slot = 0; slot < KINST_TRAMPS_PER_CHUNK; slot++) {
+			if (chunk->addr + slot * KINST_TRAMP_SIZE != tramp)
+				continue;
+
+			KINST_TRAMP_INIT(tramp, KINST_TRAMP_SIZE);
+			BIT_SET(KINST_TRAMPS_PER_CHUNK, slot, &chunk->free);
+			if (freechunks &&
+			    BIT_ISFULLSET(KINST_TRAMPS_PER_CHUNK,
+			    &chunk->free))
+				kinst_trampchunk_free(chunk);
+			return;
+		}
+	}
+	panic("%s: did not find trampoline chunk for %p", __func__, tramp);
+}
+
+/*
+ * Free the trampoline "tramp", taking kinst_tramp_sx around
+ * kinst_trampoline_dealloc_locked(). A chunk left without any allocated
+ * trampoline is released as well.
+ */
+void
+kinst_trampoline_dealloc(uint8_t *tramp)
+{
+ sx_xlock(&kinst_tramp_sx);
+ kinst_trampoline_dealloc_locked(tramp, true);
+ sx_xunlock(&kinst_tramp_sx);
+}
+
+/*
+ * thread_ctor event handler: give the thread "td" its own trampoline.
+ * t_kinst is left NULL if the allocation fails.
+ */
+static void
+kinst_thread_ctor(void *arg __unused, struct thread *td)
+{
+ td->t_kinst = kinst_trampoline_alloc(M_WAITOK);
+}
+
+/*
+ * thread_dtor event handler: detach the trampoline from the thread "td" and
+ * free it.
+ */
+static void
+kinst_thread_dtor(void *arg __unused, struct thread *td)
+{
+ void *tramp;
+
+ /* Clear the thread's pointer before the trampoline is released. */
+ tramp = td->t_kinst;
+ td->t_kinst = NULL;
+
+ /*
+ * This assumes that the thread_dtor event permits sleeping, which
+ * appears to be true for the time being.
+ */
+ kinst_trampoline_dealloc(tramp);
+}
+
+/*
+ * Register the thread constructor and destructor handlers, then give a
+ * trampoline to every existing thread that does not have one yet.
+ *
+ * Returns 0 on success, or ENOMEM if a trampoline could not be allocated.
+ * Trampolines handed out before the failure are left in place.
+ */
+int
+kinst_trampoline_init(void)
+{
+	struct proc *p;
+	struct thread *td;
+	void *tramp;
+	int error;
+
+	kinst_thread_ctor_handler = EVENTHANDLER_REGISTER(thread_ctor,
+	    kinst_thread_ctor, NULL, EVENTHANDLER_PRI_ANY);
+	kinst_thread_dtor_handler = EVENTHANDLER_REGISTER(thread_dtor,
+	    kinst_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
+
+	error = 0;
+	tramp = NULL;
+
+	sx_slock(&allproc_lock);
+	sx_xlock(&kinst_tramp_sx);
+	FOREACH_PROC_IN_SYSTEM(p) {
+retry:
+		PROC_LOCK(p);
+		FOREACH_THREAD_IN_PROC(p, td) {
+			if (td->t_kinst != NULL)
+				continue;
+			if (tramp == NULL) {
+				/*
+				 * Try to allocate a trampoline without dropping
+				 * the process lock.  If all chunks are fully
+				 * utilized, we must release the lock and try
+				 * again.
+				 */
+				tramp = kinst_trampoline_alloc_locked(M_NOWAIT);
+				if (tramp == NULL) {
+					PROC_UNLOCK(p);
+					tramp = kinst_trampoline_alloc_locked(
+					    M_WAITOK);
+					if (tramp == NULL) {
+						/*
+						 * Let the unload handler clean
+						 * up.
+						 */
+						error = ENOMEM;
+						goto out;
+					} else
+						goto retry;
+				}
+			}
+			td->t_kinst = tramp;
+			tramp = NULL;
+		}
+		PROC_UNLOCK(p);
+	}
+out:
+	/*
+	 * A trampoline allocated on the slow path may have gone unused if no
+	 * thread turned out to need one after the retry.  Hand it back so
+	 * that its slot does not stay marked as allocated; this is a no-op
+	 * when "tramp" is NULL.
+	 */
+	kinst_trampoline_dealloc_locked(tramp, true);
+	sx_xunlock(&kinst_tramp_sx);
+	sx_sunlock(&allproc_lock);
+	return (error);
+}
+
+/*
+ * Undo kinst_trampoline_init(): deregister the thread event handlers, take
+ * the trampoline away from every thread and release all chunks. Always
+ * returns 0.
+ */
+int
+kinst_trampoline_deinit(void)
+{
+ struct trampchunk *chunk, *tmp;
+ struct proc *p;
+ struct thread *td;
+
+ EVENTHANDLER_DEREGISTER(thread_ctor, kinst_thread_ctor_handler);
+ EVENTHANDLER_DEREGISTER(thread_dtor, kinst_thread_dtor_handler);
+
+ sx_slock(&allproc_lock);
+ sx_xlock(&kinst_tramp_sx);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ /*
+ * Chunks are not freed here; all of them are released
+ * in one go below.
+ */
+ kinst_trampoline_dealloc_locked(td->t_kinst, false);
+ td->t_kinst = NULL;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ TAILQ_FOREACH_SAFE(chunk, &kinst_trampchunks, next, tmp)
+ kinst_trampchunk_free(chunk);
+ sx_xunlock(&kinst_tramp_sx);
+
+ return (0);
+}
diff --git a/sys/modules/dtrace/Makefile b/sys/modules/dtrace/Makefile
--- a/sys/modules/dtrace/Makefile
+++ b/sys/modules/dtrace/Makefile
@@ -18,6 +18,7 @@
.endif
.if ${MACHINE_CPUARCH} == "amd64"
SUBDIR+= systrace_linux32
+SUBDIR+= kinst
.endif
.if ${MACHINE_CPUARCH} == "amd64" || \
${MACHINE_CPUARCH} == "aarch64" || \
diff --git a/sys/modules/dtrace/kinst/Makefile b/sys/modules/dtrace/kinst/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/dtrace/kinst/Makefile
@@ -0,0 +1,17 @@
+SYSDIR?= ${SRCTOP}/sys
+
+.PATH: ${SYSDIR}/cddl/dev/kinst \
+ ${SYSDIR}/cddl/dev/kinst/${MACHINE_CPUARCH}
+
+KMOD= kinst
+SRCS= kinst.c kinst_isa.c trampoline.c
+
+CFLAGS+= ${OPENZFS_CFLAGS} \
+ -I${SYSDIR}/cddl/dev/kinst \
+ -I${SYSDIR}/cddl/dev/dtrace/x86 \
+ -I${SYSDIR}/cddl/dev/kinst/${MACHINE_CPUARCH}
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h
+CWARNFLAGS+= ${OPENZFS_CWARNFLAGS}

File Metadata

Mime Type
text/plain
Expires
Sun, Feb 23, 1:36 AM (1 h, 45 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16784987
Default Alt Text
D36851.id.diff (32 KB)

Event Timeline