diff --git a/share/man/man4/rescue.4 b/share/man/man4/rescue.4 new file mode 100644 --- /dev/null +++ b/share/man/man4/rescue.4 @@ -0,0 +1,138 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2023 Juniper Networks, Inc. +.\" +.Dd August 1, 2023 +.Dt RESCUE 4 +.Os +.Sh NAME +.Nm rescue +.Nd Boot a rescue kernel after a kernel panic +.Sh SYNOPSIS +To compile a rescue kernel into the kernel, place the following lines in your +kernel configuration file: +.Bd -ragged -offset indent +.Cd "options RESCUE_SUPPORT" +.Cd "makeoptions RESCUE_EMBED=/path/to/rescue/kernel" +.Ed +.Pp +The rescue kernel must be compiled with the following options: +.Bd -ragged -offset indent +.Cd "options RESCUE" +.Ed +.Pp +.Sh DESCRIPTION +The +.Nm +mechanism provides a flexible mechanism to save a kernel core dump following a +panic. +Traditionally, the approach used by +.Fx +has been to designate a "dump device," typically the system's swap partition, to +which the kernel dumps memory after panicking and before rebooting. +Following a reboot, the +.Xr savecore 8 +program recovers the saved memory dump and stores it in the filesystem. +This approach has the downside of requiring a dedicated dump device, +complicating system provisioning. +.Pp +Rescue kernels take a different approach: when the kernel panics, it automatically +boots a +.Ql rescue +kernel which runs out of a reserved region in RAM; the rescue kernel can repair +and mount local filesystems, and can subsequently dump the contents of RAM +directly to a file. +In particular, rescue kernels implement +.Pa /dev/dumper , +from which a minidump can be read. +This approach is more flexible: the rescue kernel can be configured to perform +arbitrary system actions, and can even boot an interactive system. +Furthermore, it does not require any reservation of disk space. +The main downside is integration complexity: the rescue kernel must be compiled +directly into the main kernel, and its actions must be configured. +.Pp +One +.Nm +configuration is to embed a root filesystem into the rescue kernel image, +for example using the MD_ROOT kernel option implemented by +.Xr md 4 . +The root filesystem could be configured to mount a host filesystem and dump +system memory to a file before rebooting, for example with +.Bd -literal -offset indent +# fsck_ffs -fy /dev/gpt/rootfs +# mount /dev/gpt/rootfs /mnt +# dd if=/dev/dumper of=/mnt/var/crash/vmcore bs=1M +# umount /mnt +.Ed +.Pp +To configure a kernel to boot a rescue kernel upon a panic, build the kernel +with the +.Dv RESCUE_SUPPORT +option, and specify a rescue kernel to embed using the +.Dv RESCUE_EMBED +option. +The +.Va debug.rescue_minidump +.Xr loader 8 +tunable must be set to a non-zero value to boot the rescue kernel upon a kernel +panic. +.Sh IMPLEMENTATION NOTES +.Nm +is currently implemented only for the amd64 and arm64 platforms. +.Pp +When a kernel is configured to boot a rescue kernel upon a panic, it reserves +physically contiguous memory during boot. +This reservation is solely for use by the rescue kernel, preventing it from +overwriting memory used by the host kernel. +.Pp +Tunables set in the host kernel are inherited by the rescue kernel, with a +couple of special cases. +First, the rescue kernel always has +.Va kern.smp.disabled=1 +set. +Second, tunables in the host kernel prefixed by +.Va debug.rescue +are inherited directly by the rescue kernel. 
+For example, setting +.Va debug.rescue.vm.numa.disabled=1 +in the main kernel will cause +.Va vm.numa.disabled=1 +to be set in the rescue kernel. +.Pp +Use of a rescue kernel requires reserving a region of host memory. +Typically, 64-96MB of memory is sufficient. +The default amount reserved can be changed by setting the +.Va debug.rescue_memsize +tunable to the desired amount of memory, in bytes. +For example, setting +.Va debug.rescue_memsize=256M +will reserve 256MB of memory for the rescue kernel. +.Pp +When the rescue kernel is embedded using the +.Dv RESCUE_EMBED +option, the host kernel must copy the rescue kernel to a suitably aligned +portion of the rescue kernel's memory reservation. +Thus, system memory will contain two copies of the rescue kernel. +To avoid wasting memory, on amd64 the original copy of the rescue kernel will +be freed once the rescue kernel's memory reservation has been initialized. +This behavior can be disabled by setting +.Va debug.rescue_free_kernel=0 . +.Pp +Currently the +.Nm +mechanism assumes that the host and rescue kernels are built from the same +sources. +That is, there is no compatibility guarantee if the two kernels are built from +different revisions of the +.Fx +source code, even if doing so appears to work in practice. +.Sh SEE ALSO +.Xr md 4 , +.Xr arch 7 , +.Xr build 7 , +.Xr savecore 8 , +.Xr panic 9 +.Sh BUGS +amd64 systems using the LA57 extension, i.e., five-level page tables, are not +yet supported. diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S --- a/sys/amd64/amd64/locore.S +++ b/sys/amd64/amd64/locore.S @@ -89,6 +89,28 @@ 0: hlt jmp 0b +#ifdef RESCUE_SUPPORT +/* + * void rescue_tramp(uint64_t ncr3, uintptr_t startaddr, uintptr_t itramp); + * + * Set ourselves up to jump to amd64's btext: + * - Start executing from the identity map instead of the kernel map. + * - Install the bootstrap page tables for the rescue kernel, emulating the + * loader's behaviour of mapping the kernel at KERNBASE and establishing an + * identity map. + * - Jump to "startaddr", the rescue kernel's entry point above KERNBASE. + */ + .globl rescue_itramp + .p2align 6 /* avoid straddling a page */ +ENTRY(rescue_tramp) + jmp *%rdx /* jump to rescue_itramp in the identity map */ +rescue_itramp: + movq %rdi, %cr3 + jmp *%rsi + int3 +END(rescue_tramp) +#endif /* RESCUE_SUPPORT */ + /* * void la57_trampoline(%rdi pml5) * diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -128,6 +128,9 @@ #include #include #include +#ifdef RESCUE +#include +#endif #include #include #include @@ -792,21 +795,8 @@ printf("\n"); } - switch (p->md_type) { - case EFI_MD_TYPE_CODE: - case EFI_MD_TYPE_DATA: - case EFI_MD_TYPE_BS_CODE: - case EFI_MD_TYPE_BS_DATA: - case EFI_MD_TYPE_FREE: - /* - * We're allowed to use any entry with these types. 
- */ - break; - default: - continue; - } - - if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE, + if (efi_physmem_type(p->md_type) && + !add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE, physmap, physmap_idx)) break; } @@ -1283,6 +1273,105 @@ return (*pde & PG_FRAME); } +#ifdef RESCUE +static vm_offset_t +preload_add_data(vm_offset_t dst, const void *src, uint32_t type, uint32_t size) +{ + uint32_t *data; + + data = (uint32_t *)dst; + *data++ = type; + *data++ = size; + memcpy_early(data, src, size); + return ((vm_offset_t)data + roundup2(size, sizeof(void *))); +} + +static vm_offset_t +preload_add_string(vm_offset_t dst, uint32_t type, const char *s) +{ + return (preload_add_data(dst, s, type, strlen(s) + 1)); +} + +static vm_offset_t +preload_add_u64(vm_offset_t dst, uint32_t type, uint64_t val) +{ + return (preload_add_data(dst, &val, type, sizeof(val))); +} + +static vm_offset_t +preload_add_smap(vm_offset_t dst, struct bios_smap *src, uint32_t size) +{ + return (preload_add_data(dst, (void *)src, + MODINFO_METADATA | MODINFOMD_SMAP, size)); +} + +static vm_offset_t +preload_add_efimap(vm_offset_t dst, struct efi_map_header *efihdr) +{ + uint32_t size; + + size = roundup2(sizeof(struct efi_map_header), 16) + + efihdr->memory_size; + return (preload_add_data(dst, efihdr, + MODINFO_METADATA | MODINFOMD_EFI_MAP, size)); +} + +static vm_offset_t +preload_add_efifb(vm_offset_t dst, struct efi_fb *efifb) +{ + return (preload_add_data(dst, efifb, + MODINFO_METADATA | MODINFOMD_EFI_FB, sizeof(struct efi_fb))); +} + +static vm_offset_t +preload_add_terminator(vm_offset_t dst) +{ + memset_early((void *)dst, 0, sizeof(uint32_t) * 2); + return (dst + sizeof(uint32_t) * 2); +} + +static void +rescue_preload_init(uint64_t *modulepp, uint64_t *physfreep) +{ + struct rescue_kernel_params *params; + vm_offset_t env, kernend, md, mdstart, off; + + params = (struct rescue_kernel_params *)KERNBASE; + + off = round_page((uintptr_t)&_end - KERNSTART) + kernphys; + + env = off; + memcpy_early((void *)env, (void *)params->kp_kenvstart, + params->kp_kenvlen); + off += round_page(params->kp_kenvlen); + + md = mdstart = off; + md = preload_add_string(md, MODINFO_NAME, "kernel"); + md = preload_add_string(md, MODINFO_TYPE, "elf kernel"); + md = preload_add_u64(md, MODINFO_ADDR, KERNSTART); + md = preload_add_u64(md, MODINFO_SIZE, (uintptr_t)&_end - KERNBASE); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_ENVP, + env - (kernphys - KERNLOAD)); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_HOWTO, + params->kp_boothowto); + if (params->kp_efimapstart != 0) + md = preload_add_efimap(md, (void *)params->kp_efimapstart); + else + md = preload_add_smap(md, (void *)params->kp_smapstart, + params->kp_smaplen); + if (params->kp_efifbaddr != 0) + md = preload_add_efifb(md, (void *)params->kp_efifbaddr); + kernend = md - mdstart + 3 * sizeof(uint64_t); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_KERNEND, kernend); + md = preload_add_terminator(md); + + rescue_dumper_init(¶ms->kp_dumpparams); + + *modulepp = (uintptr_t)mdstart - (kernphys - KERNLOAD); + *physfreep = round_page(md); +} +#endif + u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { @@ -1301,6 +1390,16 @@ physfree += kernphys; +#ifdef RESCUE + /* + * The rescue kernel runs without any module metadata. The panicked + * kernel could provide it, but some variables, like the size of the + * loaded rescue kernel, can't easily be determined there. So, fake it + * here. 
+ */ + rescue_preload_init(&modulep, &physfree); +#endif + kmdp = init_ops.parse_preload_data(modulep); efi_boot = preload_search_info(kmdp, MODINFO_METADATA | diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c --- a/sys/amd64/amd64/minidump_machdep.c +++ b/sys/amd64/amd64/minidump_machdep.c @@ -50,6 +50,9 @@ #include #include #include +#ifdef RESCUE_SUPPORT +#include +#endif #include CTASSERT(sizeof(struct kerneldumpheader) == 512); @@ -269,6 +272,14 @@ } dumpsize += PAGE_SIZE; +#ifdef RESCUE_SUPPORT + if (do_rescue_minidump) { + rescue_kernel_exec(); + /* Shouldn't return here unless something goes very wrong. */ + return (ENXIO); + } +#endif + wdog_next = progress = dumpsize; dumpsys_pb_init(dumpsize); diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -330,6 +330,9 @@ int apic_id, cpu, domain, i; u_char mpbiosreason; + if (mp_ncpus == 1) + return (0); + amd64_mp_alloc_pcpu(); mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); diff --git a/sys/amd64/amd64/rescue_dumper.c b/sys/amd64/amd64/rescue_dumper.c new file mode 100644 --- /dev/null +++ b/sys/amd64/amd64/rescue_dumper.c @@ -0,0 +1,457 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2020, 2023 Juniper Networks Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +enum dump_segs { + DUMP_SEG_MDHDR = 0, /* minidump header */ + DUMP_SEG_MSGBUF, /* kernel message buffer */ + DUMP_SEG_DUMP_AVAIL, /* physical address ranges */ + DUMP_SEG_BITMAP, /* vm_page_dump array */ + DUMP_SEG_PTPS, /* kernel page table pages */ + DUMP_SEG_PAGES, /* pages marked in vm_page_dump */ + DUMP_SEG_COUNT, +}; + +struct dump_seg { + vm_offset_t ds_addr; + vm_size_t ds_sz; +}; + +struct dump_softc { + struct minidumphdr *sc_mdhdr; + struct dump_seg sc_segs[DUMP_SEG_COUNT]; + vm_offset_t sc_kernpml4; + vm_offset_t sc_scratchkva; + char *sc_scratchbuf; + u_long sc_npages; + off_t sc_cursor; +}; + +FEATURE(rescue, "rescue kernel dumper"); + +static MALLOC_DEFINE(M_DUMPER, "dumper", "Rescue dumper structures"); + +static struct rescue_dump_params params; + +void +rescue_dumper_init(struct rescue_dump_params *p) +{ + memcpy(¶ms, p, sizeof(params)); +} + +static void * +map_scratch(struct dump_softc *sc, vm_paddr_t pa) +{ + vm_offset_t scratch; + + scratch = sc->sc_scratchkva; + pmap_kenter(scratch, pa); + pmap_invlpg(kernel_pmap, scratch); + return ((void *)scratch); +} + +static void +dump_seg_init(struct dump_seg *seg, vm_offset_t addr, vm_size_t sz) +{ + seg->ds_addr = addr; + seg->ds_sz = sz; +} + +static vm_offset_t +map_host_seg(vm_paddr_t pa, vm_size_t size) +{ + vm_offset_t va; + + size = round_page(size + (pa & PAGE_MASK)); + va = kva_alloc(size); + if (va != 0) { + for (vm_size_t off = 0; off < size; off += PAGE_SIZE) + pmap_kenter(va + off, (pa & ~PAGE_MASK) + off); + } + return (va + (pa & PAGE_MASK)); +} + +static void +unmap_host_seg(struct dump_seg *seg) +{ + vm_offset_t va; + vm_size_t off, size; + + va = seg->ds_addr; + if (va == 0) + return; + + size = round_page(seg->ds_sz + (va & PAGE_MASK)); + va &= ~PAGE_MASK; + for (off = 0; off < size; off += PAGE_SIZE) + pmap_kremove(va + off); + kva_free(va, size); +} + +static void +dumper_cdevpriv_dtr(void *arg) +{ + struct dump_softc *sc; + + sc = arg; + + free(sc->sc_scratchbuf, M_DUMPER); + if (sc->sc_scratchkva != 0) + kva_free(sc->sc_scratchkva, PAGE_SIZE); + if (sc->sc_kernpml4 != 0) + pmap_kremove(sc->sc_kernpml4); + + unmap_host_seg(&sc->sc_segs[DUMP_SEG_BITMAP]); + unmap_host_seg(&sc->sc_segs[DUMP_SEG_DUMP_AVAIL]); + unmap_host_seg(&sc->sc_segs[DUMP_SEG_MSGBUF]); + + free(sc->sc_mdhdr, M_DUMPER); + free(sc, M_DUMPER); +} + +CTASSERT(sizeof(struct minidumphdr) <= PAGE_SIZE); + +static int +dumper_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + struct dump_softc *sc; + struct minidumphdr *mdhdr; + uint64_t *bitmap; + vm_offset_t va; + u_long i; + int error; + + sc = malloc(sizeof(*sc), M_DUMPER, M_WAITOK | M_ZERO); + + /* + * The minidump header gets padded out to a full page. 
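+	 * Keeping this, and every other segment, a multiple of PAGE_SIZE
+	 * keeps the segment base offsets page-aligned, which dumper_read()
+	 * asserts when it walks the segment list.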
+ */ + mdhdr = malloc(PAGE_SIZE, M_DUMPER, M_WAITOK | M_ZERO); + (void)strcpy(mdhdr->magic, MINIDUMP_MAGIC); + mdhdr->version = MINIDUMP_VERSION; + mdhdr->msgbufsize = round_page(params.dp_msgbufsz); + mdhdr->bitmapsize = round_page(params.dp_vmdumpsz); + mdhdr->pmapsize = howmany(params.dp_kernend - params.dp_kernstart, + NBPDP) * PAGE_SIZE; + mdhdr->kernbase = params.dp_kernstart; + mdhdr->dmapbase = params.dp_dmapmin; + mdhdr->dmapend = params.dp_dmapmax; + mdhdr->dumpavailsize = round_page(sizeof(dump_avail)); + sc->sc_mdhdr = mdhdr; + + dump_seg_init(&sc->sc_segs[DUMP_SEG_MDHDR], (vm_offset_t)mdhdr, + PAGE_SIZE); + + /* + * Map the root kernel page table page. It is not included in the dump, + * but is needed in order to walk the page tables so it might as well be + * statically mapped. + * + * Also allocate a page of KVA to map the rest of the kernel page table + * pages during walks. + */ + sc->sc_kernpml4 = map_host_seg(params.dp_kernpml4pa, PAGE_SIZE); + if (sc->sc_kernpml4 == 0) { + error = ENOMEM; + goto err; + } + sc->sc_scratchkva = kva_alloc(PAGE_SIZE); + if (sc->sc_scratchkva == 0) { + error = ENOMEM; + goto err; + } + + /* + * In some cases it is necessary to synthesize a fake page table page. + */ + sc->sc_scratchbuf = malloc(PAGE_SIZE, M_DUMPER, M_WAITOK | M_ZERO); + + /* + * Map segments of the host kernel that get included in the minidump. + */ + va = map_host_seg(params.dp_msgbufpa, mdhdr->msgbufsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_MSGBUF], va, mdhdr->msgbufsize); + + va = map_host_seg(params.dp_dumpavailpa, mdhdr->dumpavailsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_DUMP_AVAIL], va, + mdhdr->dumpavailsize); + + va = map_host_seg(params.dp_vmdumppa, mdhdr->bitmapsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_BITMAP], va, mdhdr->bitmapsize); + + /* + * Create a virtual dump segment for the kernel page tables and marked + * host pages. + */ + dump_seg_init(&sc->sc_segs[DUMP_SEG_PTPS], 0, mdhdr->pmapsize); + + sc->sc_npages = 0; + bitmap = (uint64_t *)sc->sc_segs[DUMP_SEG_BITMAP].ds_addr; + for (i = 0; i < mdhdr->bitmapsize / sizeof(uint64_t); i++) + sc->sc_npages += bitcount64(bitmap[i]); + dump_seg_init(&sc->sc_segs[DUMP_SEG_PAGES], 0, + sc->sc_npages * PAGE_SIZE); + + error = devfs_set_cdevpriv(sc, dumper_cdevpriv_dtr); + if (error != 0) + goto err; + + return (0); + +err: + dumper_cdevpriv_dtr(sc); + return (error); +} + +/* + * Map a host page directory page. + */ +static pd_entry_t * +map_pde(struct dump_softc *sc, pd_entry_t pde) +{ + return (map_scratch(sc, pde & PG_FRAME)); +} + +/* + * Return a host page table page mapping the specified virtual address. + */ +static void * +map_ptp(struct dump_softc *sc, vm_offset_t va) +{ + pml4_entry_t *pml4p; + pdp_entry_t *pdpp, pdp; + pt_entry_t *ptp; + + KASSERT((va & PDPMASK) == 0, ("%s: unaligned VA %#lx", __func__, va)); + + /* + * PML4 entries at or above VM_MIN_KERNEL_ADDRESS are always valid. + */ + pml4p = (pml4_entry_t *)sc->sc_kernpml4 + pmap_pml4e_index(va); + KASSERT((*pml4p & X86_PG_V) != 0, + ("%s: invalid PML4 entry %#lx for va %#lx", __func__, *pml4p, va)); + + pdpp = map_pde(sc, *pml4p); + pdp = atomic_load_64(&pdpp[pmap_pdpe_index(va)]); + if ((pdp & X86_PG_V) == 0) { + /* Invalid entry, return a zero-filled page. 
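+		 * A page of zeroed (invalid) entries makes the corresponding
+		 * 1GB of KVA read as unmapped in the resulting minidump.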
*/ + memset(sc->sc_scratchbuf, 0, PAGE_SIZE); + return (sc->sc_scratchbuf); + } + if ((pdp & X86_PG_PS) != 0) { + /* Dump a 1GB mapping using a fake PTP. */ + ptp = (pt_entry_t *)sc->sc_scratchbuf; + for (int i = 0; i < NPDPEPG; i++) + ptp[i] = pdp + (vm_paddr_t)i * NBPDR; + return (ptp); + } else { + return (map_pde(sc, pdp)); + } +} + +static int +dumper_read_seg(struct dump_softc *sc, enum dump_segs idx, struct dump_seg *seg, + off_t baseoff, struct uio *uio) +{ + off_t off; + int error; + + KASSERT(baseoff <= uio->uio_offset && + baseoff + seg->ds_sz > uio->uio_offset, + ("%s: invalid offset %#lx into seg at %#lx-%#lx", __func__, + uio->uio_offset, baseoff, baseoff + seg->ds_sz)); + + error = 0; + off = uio->uio_offset - baseoff; + switch (idx) { + case DUMP_SEG_MDHDR: + case DUMP_SEG_MSGBUF: + case DUMP_SEG_DUMP_AVAIL: + case DUMP_SEG_BITMAP: + /* Linear segments can simply be copied. */ + error = uiomove((char *)seg->ds_addr + off, seg->ds_sz - off, + uio); + break; + case DUMP_SEG_PTPS: + /* Dump leaf page table pages. */ + for (vm_offset_t va = + params.dp_kernstart + (off / PAGE_SIZE) * NBPDP; + va < params.dp_kernend; va += NBPDP) { + char *ptp; + + ptp = map_ptp(sc, va); + error = uiomove(ptp + (off & PAGE_MASK), + PAGE_SIZE - (off & PAGE_MASK), uio); + if (error != 0 || uio->uio_resid == 0) + break; + off = uio->uio_offset - baseoff; + } + break; + case DUMP_SEG_PAGES: { + struct bitset *bitset; + vm_paddr_t *avail, pa; + size_t bitsetsize; + off_t off1; + long bit; + + avail = (vm_paddr_t *)sc->sc_segs[DUMP_SEG_DUMP_AVAIL].ds_addr; + + /* Dump pages marked in the bitmap. This is non-destructive. */ + bitset = (struct bitset *)sc->sc_segs[DUMP_SEG_BITMAP].ds_addr; + bitsetsize = sc->sc_segs[DUMP_SEG_BITMAP].ds_sz; + off1 = 0; + BIT_FOREACH_ISSET(bitsetsize * NBBY, bit, bitset) { + char *page; + int i; + + if (off1 < off) { + off1 += PAGE_SIZE; + continue; + } + + for (i = 0; avail[i + 1] != 0; i += 2) { + int npages; + + npages = howmany(avail[i + 1], PAGE_SIZE) - + avail[i] / PAGE_SIZE; + if (bit < npages) { + pa = avail[i] + bit * PAGE_SIZE; + break; + } + bit -= npages; + } + if (avail[i + 1] == 0) + panic("failed to map bit %ld to a page", bit); + + page = map_scratch(sc, pa); + error = uiomove(page + (off % PAGE_SIZE), + PAGE_SIZE - (off % PAGE_SIZE), uio); + if (error != 0) + break; + if (uio->uio_resid == 0) + break; + off = off1 = uio->uio_offset - baseoff; + } + break; + } + default: + panic("%s: unknown segment index %d", __func__, idx); + } + + return (error); +} + +static int +dumper_read(struct cdev *dev, struct uio *uio, int flags) +{ + struct dump_softc *sc; + struct dump_seg *seg; + off_t baseoff, off; + int error, i; + + error = devfs_get_cdevpriv((void **)&sc); + if (error != 0) + return (error); + + off = uio->uio_offset; + if (off < 0) + return (EINVAL); + + /* Seeks are not supported. 
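+	 * The dump is produced as one sequential stream (minidump header,
+	 * msgbuf, dump_avail, bitmap, page table pages, data pages), so
+	 * reading from offset zero with dd(1), as in the rescue(4) example,
+	 * is sufficient.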
*/ + if (off != sc->sc_cursor) + return (ESPIPE); + + for (baseoff = 0, i = 0; i < DUMP_SEG_COUNT; i++) { + seg = &sc->sc_segs[i]; + if (off >= baseoff && off < baseoff + seg->ds_sz) { + error = dumper_read_seg(sc, i, seg, baseoff, uio); + break; + } + baseoff += seg->ds_sz; + MPASS((baseoff & PAGE_MASK) == 0); + } + + sc->sc_cursor = uio->uio_offset; + return (error); +} + +static struct cdevsw dumper_cdevsw = { + .d_version = D_VERSION, + .d_open = dumper_open, + .d_read = dumper_read, + .d_name = "dumper", +}; + +static int +dumper_modevent(module_t mod __unused, int type, void *data __unused) +{ + static struct cdev *dumper_dev; + + switch (type) { + case MOD_LOAD: + dumper_dev = make_dev(&dumper_cdevsw, 0, UID_ROOT, GID_WHEEL, + 0600, "dumper"); + break; + case MOD_UNLOAD: + destroy_dev(dumper_dev); + break; + } + return (0); +} +DEV_MODULE(dumper, dumper_modevent, NULL); +MODULE_VERSION(dumper, 1); diff --git a/sys/amd64/amd64/rescue_machdep.c b/sys/amd64/amd64/rescue_machdep.c new file mode 100644 --- /dev/null +++ b/sys/amd64/amd64/rescue_machdep.c @@ -0,0 +1,708 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +bool do_rescue_minidump = false; + +/* + * Parameters for memory reserved for the rescue kernel. The boundary and + * alignment are fixed by the requirements of locore. The size is configurable + * but of course must be satisfiable by an allocation with the defined alignment + * and boundary requirements. + * + * rescue_kernel_exec() also assumes that the reserved region can be mapped + * using a single PDP entry. + */ +#define RESCUE_RESERV_ALIGN (2 * 1024 * 1024u) /* 2MB */ +#define RESCUE_RESERV_BOUNDARY (1024 * 1024 * 1024u) /* 1GB */ +#define RESCUE_RESERV_DEFAULT_SIZE (128 * 1024 * 1024u) /* 128MB */ + +/* + * Environment variables beginning with this prefix are copied into the rescue + * kernel's environment with the prefix stripped. + */ +#define RESCUE_KENV_PREFIX "debug.rescue." 
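+/*
+ * These are typically set in the host's loader.conf(5), for example:
+ *
+ *	debug.rescue_minidump="1"
+ *	debug.rescue_memsize="256M"
+ *	debug.rescue.vm.numa.disabled="1"
+ *
+ * The last entry is copied into the rescue kernel's environment with the
+ * prefix stripped, i.e., as vm.numa.disabled=1; see rescue_kernel_init().
+ */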
+#define RESCUE_KENV_ENABLED "debug.rescue_minidump" +#define RESCUE_KENV_MEMSIZE "debug.rescue_memsize" + +static void *rescue_va; +static vm_paddr_t rescue_pa; +static vm_size_t rescue_memsize; + +/* + * Called from the host kernel at panic time to populate rescue dumper + * parameters. The returned structure is passed to the rescue kernel. + */ +static void +rescue_dump_params_init(struct rescue_dump_params *rdp) +{ + /* XXX-MJ currently does not handle 5-level page tables */ + rdp->dp_msgbufpa = vtophys(msgbufp->msg_ptr); + rdp->dp_msgbufsz = msgbufp->msg_size; + rdp->dp_vmdumppa = vtophys(vm_page_dump); + rdp->dp_vmdumpsz = BITSET_SIZE(vm_page_dump_pages); + rdp->dp_dumpavailpa = vtophys(dump_avail); + rdp->dp_kernpml4pa = vtophys(kernel_pmap->pm_pmltop); + rdp->dp_kernstart = VM_MIN_KERNEL_ADDRESS; + rdp->dp_kernend = MAX(KERNBASE + nkpt * NBPDR, kernel_vm_end); + rdp->dp_kernmax = VM_MAX_KERNEL_ADDRESS; + rdp->dp_dmapmin = DMAP_MIN_ADDRESS; + rdp->dp_dmapmax = DMAP_MAX_ADDRESS; +} + +static void +rescue_kernel_cpu_switch(void) +{ + struct pcpu *pcpu; + + pcpu = cpuid_to_pcpu[0]; + if (get_pcpu() != pcpu) { + cpustop_restartfunc = rescue_kernel_exec; + atomic_thread_fence_seq_cst(); + CPU_SET_ATOMIC(0, &started_cpus); + for (;;) + cpu_spinwait(); + } +} + +#define NRESCUEPTP (16) +static uint64_t *rescue_ptps = NULL; +static size_t rescue_ptps_offset = 0; + +/* + * A bump allocator for bootstrap page table pages. This uses the rescue + * reservation since locore/hammer_time() might interrogate the PTPs to + * determine where the kernel is loaded. In particular, the PTPs must be + * mapped by rescue_kernel_exec(). + */ +static uint64_t * +rescue_kernel_alloc_ptp(void) +{ + uint64_t *ptp; + + if (rescue_ptps_offset * sizeof(uint64_t) >= NRESCUEPTP * PAGE_SIZE) { + /* Should only happen due to a programming error. */ + panic("rescue: ran out of bootstrap PTPs"); + } + ptp = rescue_ptps + rescue_ptps_offset; + rescue_ptps_offset += NPTEPG; + return (ptp); +} + +extern void rescue_tramp(uint64_t ncr3, uintptr_t start, uintptr_t itramp); +extern uintptr_t rescue_itramp; + +/* + * Set up initial page tables for the rescue kernel. These need to satisfy both + * amd64 locore and the remainder of rescue_kernel_exec(). So, we map: + * - the rescue reservation using an identity map, + * - the current thread's stack, + * - everything that's currently mapped above KERNBASE. + * + * The physical address of the new root PTP is returned. + */ +static int +rescue_kernel_exec_cr3(uint64_t *cr3p) +{ + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + pt_entry_t *pt; + vm_offset_t kstack; + vm_paddr_t tramppa; + size_t kstacksz; + + pml4 = rescue_kernel_alloc_ptp(); + + /* + * First build the identity map for the reservation using 2MB pages. + */ + pdp = rescue_kernel_alloc_ptp(); + pml4[pmap_pml4e_index(rescue_pa)] = X86_PG_RW | X86_PG_V | vtophys(pdp); + pd = rescue_kernel_alloc_ptp(); + pdp[pmap_pdpe_index(rescue_pa)] = X86_PG_RW | X86_PG_V | vtophys(pd); + for (vm_paddr_t pa = rescue_pa; pa < rescue_pa + rescue_memsize; + pa += NBPDR) + pd[pmap_pde_index(pa)] = X86_PG_PS | X86_PG_RW | X86_PG_V | pa; + + /* + * Extend the identity map to include the host kernel's rescue_tramp(). + * rescue_tramp() is aligned such that it will not cross a page + * boundary. 
+ */ + tramppa = trunc_2mpage(vtophys(&rescue_tramp)); + if (pml4[pmap_pml4e_index(tramppa)] == 0) { + printf("rescue: rescue_tramp() is not mapped by PML4 page\n"); + return (EDOOFUS); + } + if (pdp[pmap_pdpe_index(tramppa)] == 0) { + pd = rescue_kernel_alloc_ptp(); + pdp[pmap_pdpe_index(tramppa)] = X86_PG_A | X86_PG_V | + vtophys(pd); + } + if (pd[pmap_pde_index(tramppa)] == 0) { + pd[pmap_pde_index(tramppa)] = X86_PG_PS | X86_PG_A | X86_PG_V | + tramppa; + } + + /* + * Identity-map the low 4KB of RAM for the benefit of the BIOS. + */ + if (pml4[pmap_pml4e_index(0)] == 0) { + pdp = rescue_kernel_alloc_ptp(); + pml4[pmap_pml4e_index(0)] = X86_PG_RW | X86_PG_V | + vtophys(pdp); + } else { + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4[pmap_pml4e_index(0)] & + PG_FRAME); + } + if (pdp[pmap_pdpe_index(0)] == 0) { + pd = rescue_kernel_alloc_ptp(); + pdp[pmap_pdpe_index(0)] = X86_PG_RW | X86_PG_V | vtophys(pd); + } else { + pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pmap_pdpe_index(0)] & + PG_FRAME); + } + if (pd[pmap_pde_index(0)] != 0) { + printf("rescue: low 2MB already occupied by reservation\n"); + return (EDOOFUS); + } + pt = rescue_kernel_alloc_ptp(); + pd[pmap_pde_index(0)] = X86_PG_RW | X86_PG_V | vtophys(pt); + pt[pmap_pte_index(0)] = X86_PG_RW | X86_PG_V; + + /* + * Map the rescue kernel at KERNSTART. + */ + if (pml4[pmap_pml4e_index(KERNBASE)] != 0) { + printf("rescue: kernel already mapped by identity map\n"); + return (EDOOFUS); + } + pdp = rescue_kernel_alloc_ptp(); + pml4[pmap_pml4e_index(KERNBASE)] = X86_PG_RW | X86_PG_V | vtophys(pdp); + pd = rescue_kernel_alloc_ptp(); + pdp[pmap_pdpe_index(KERNBASE)] = X86_PG_RW | X86_PG_V | vtophys(pd); + for (vm_offset_t va = KERNBASE; va < KERNBASE + rescue_memsize; + va += NBPDR) { + pd[pmap_pde_index(va)] = X86_PG_PS | X86_PG_RW | X86_PG_V | + (rescue_pa + (va - KERNBASE)); + } + + /* + * Finally, map the current stack. For now we assume that the stack + * doesn't span multiple PDEs. This is generally true, but a more + * complete implementation could handle that possibility. + */ + kstack = curthread->td_kstack; + kstacksz = curthread->td_kstack_pages * PAGE_SIZE; + if (pml4[pmap_pml4e_index(kstack)] != 0) { + printf("rescue: kernel stack already mapped by identity map\n"); + return (ENXIO); + } + if (pmap_pdpe_index(kstack) != pmap_pdpe_index(kstack + kstacksz - 1)) { + printf("rescue: kernel stack spans multiple PDP pages\n"); + return (ENXIO); + } + if (pmap_pde_index(kstack) != pmap_pde_index(kstack + kstacksz - 1)) { + printf("rescue: kernel stack spans multiple PD pages\n"); + return (ENXIO); + } + pdp = rescue_kernel_alloc_ptp(); + pml4[pmap_pml4e_index(kstack)] = X86_PG_RW | X86_PG_V | vtophys(pdp) | + pg_nx; + pd = rescue_kernel_alloc_ptp(); + pdp[pmap_pdpe_index(kstack)] = X86_PG_RW | X86_PG_V | vtophys(pd) | + pg_nx; + pt = rescue_kernel_alloc_ptp(); + pd[pmap_pde_index(kstack)] = X86_PG_RW | X86_PG_V | vtophys(pt) | + pg_nx; + for (vm_offset_t va = kstack; va < kstack + kstacksz; va += PAGE_SIZE) { + pt[pmap_pte_index(va)] = X86_PG_RW | X86_PG_V | pg_nx | + X86_PG_A | X86_PG_M | vtophys(va); + } + + *cr3p = vtophys(pml4); + return (0); +} + +void +rescue_kernel_exec(void) +{ + pml4_entry_t *opml4, *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + pt_entry_t *pt; + struct rescue_kernel_params *params; + uintptr_t entry, itramp, tramp; + Elf64_Ehdr *ehdr; + uint64_t cr3, ocr3; + + KASSERT((read_rflags() & PSL_I) == 0, + ("%s: interrupts enabled", __func__)); + + /* + * Switch to the boot CPU if we are not already on it. 
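+	 * rescue_kernel_cpu_switch() re-enters this function on CPU 0 via
+	 * cpustop_restartfunc and parks the current CPU, so the rescue
+	 * kernel is always launched from the BSP.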
+ */ + rescue_kernel_cpu_switch(); + + printf("rescue: preparing to exec rescue kernel\n"); + + intr_rescue_exec(); + + /* + * Prepare the dump parameters structure for the rescue kernel. The + * rest of the parameters must already have been initialized. These + * will be accessed via an aliasing mapping, so make sure the cache is + * written back. + */ + params = rescue_va; + rescue_dump_params_init(¶ms->kp_dumpparams); + + ehdr = (Elf64_Ehdr *)((char *)rescue_va + RESCUE_RESERV_KERNEL_OFFSET); + if (ehdr->e_ident[0] != ELFMAG0 || ehdr->e_ident[1] != ELFMAG1 || + ehdr->e_ident[2] != ELFMAG2 || ehdr->e_ident[3] != ELFMAG3) { + printf("rescue: rescue kernel is not an ELF file\n"); + return; + } + entry = ehdr->e_entry; + + if (rescue_kernel_exec_cr3(&cr3) != 0) { + printf("rescue: failed to initialize bootstrap page tables\n"); + return; + } + + tramp = trunc_page(vtophys(&rescue_tramp)); + itramp = vtophys(&rescue_itramp); + + /* + * amd64 locore expects to be executed via a mapping at KERNSTART. + * However, the current (panicked) kernel is already mapped there. So, + * we use a trampoline which can be executed via an identity map; the + * trampoline installs the rescue kernel's bootstrap root PML4 page + * before jumping to its entry point. + * + * For this to work, the trampoline must be identity-mapped in both the + * old and new kernels. rescue_kernel_exec_cr3() takes care of this for + * the rescue kernel. For the old kernel, we make a copy of the + * current PML4P (to avoid modifying host memory which might be relevant + * to a debugging session), then install it as the root PTP until the + * trampoline swaps in its own root PTP. + */ + ocr3 = rcr3() & ~(CR3_PCID_MASK | CR3_PCID_SAVE); + opml4 = (pml4_entry_t *)PHYS_TO_DMAP(ocr3); + pml4 = rescue_kernel_alloc_ptp(); + /* Just copy everything in the top half of the address space. */ + for (vm_pindex_t i = PML4PML4I; i < NPML4EPG; i++) + pml4[i] = opml4[i]; + pdp = rescue_kernel_alloc_ptp(); + pml4[pmap_pml4e_index(tramp)] = X86_PG_A | X86_PG_V | vtophys(pdp); + pd = rescue_kernel_alloc_ptp(); + pdp[pmap_pdpe_index(tramp)] = X86_PG_A | X86_PG_V | vtophys(pd); + pt = rescue_kernel_alloc_ptp(); + pd[pmap_pde_index(tramp)] = X86_PG_A | X86_PG_V | vtophys(pt); + pt[pmap_pte_index(tramp)] = X86_PG_A | X86_PG_V | tramp; + load_cr4(rcr4() & ~(CR4_PCIDE | CR4_PGE)); + load_cr3(vtophys(pml4)); + + rescue_tramp(cr3, entry, itramp); +} + +/* + * Dummy function to satisfy the dumper interface. This should never be + * called. + */ +static int +rescue_dumper_dummy(void *priv, void *virtual, off_t offset, size_t length) +{ + printf("%s: unexpected call\n", __func__); + return (EOPNOTSUPP); +} + +/* + * Copy a buffer into the rescue kernel's memory reservation at the specified + * offset. Returns an error if the copy would overflow the reservation buffer. + */ +static int +rescue_memcpy(vm_offset_t off, const void *src, size_t size, vm_offset_t *offp) +{ + if (off >= rescue_memsize || off + size > rescue_memsize) + return (1); + + memcpy((char *)rescue_va + off, src, size); + if (offp != NULL) + *offp = off + size; + return (0); +} + +/* + * Memset a region of the rescue kernel's memory reservation, with overflow + * checking. 
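+ * It is used below to zero out segments of the copied rescue kernel image
+ * whose file size is smaller than their memory size (i.e., the BSS).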
+ */ +static int +rescue_memset(vm_offset_t off, char c, size_t size) +{ + if (off >= rescue_memsize || off + size > rescue_memsize) + return (1); + + memset((char *)rescue_va + off, c, size); + return (0); +} + +static size_t +rescue_kernel_init_efimap(const struct efi_map_header *srchdr, vm_offset_t off, + unsigned long memsize) +{ + struct efi_map_header hdr; + const struct efi_md *srcmd; + vm_offset_t start, end; + const size_t hdrsz = roundup2(sizeof(struct efi_map_header), 16); + int ndesc; + + start = end = off; + + memcpy(&hdr, srchdr, sizeof(hdr)); + end += hdrsz; + + /* + * Copy the memory map, excluding RAM entries that do not overlap with + * the rescue reservation. + */ + srcmd = (const struct efi_md *)((const uint8_t *)srchdr + hdrsz); + ndesc = srchdr->memory_size / srchdr->descriptor_size; + for (int i = 0; i < ndesc; i++) { + if (efi_physmem_type(srcmd->md_type)) { + if (srcmd->md_phys == 0) { + if (rescue_memcpy(end, srcmd, + srchdr->descriptor_size, &end)) + return (0); + } else if (srcmd->md_phys <= rescue_pa && + srcmd->md_phys + ptoa(srcmd->md_pages) >= + rescue_pa) { + struct efi_md *dstmd; + + dstmd = malloc(srchdr->descriptor_size, M_TEMP, + M_WAITOK); + memcpy(dstmd, srcmd, srchdr->descriptor_size); + dstmd->md_phys = rescue_pa; + dstmd->md_pages = atop(memsize); + bool err = rescue_memcpy(end, dstmd, + srchdr->descriptor_size, &end); + free(dstmd, M_TEMP); + if (err) + return (0); + } + } else if (rescue_memcpy(end, srcmd, srchdr->descriptor_size, + &end)) { + return (0); + } + srcmd = efi_next_descriptor(__DECONST(void *, srcmd), + srchdr->descriptor_size); + } + hdr.memory_size = end - start - sizeof(hdr); + if (rescue_memcpy(start, &hdr, sizeof(hdr), NULL)) + return (0); + + return (end - start); +} + +/* + * Initialize the rescue kernel's staging area: + * 1. Allocate the staging area. + * 2. Stash kernel metadata (the memory map, loader tunables) at the beginning + * of the staging area. + * 3. Copy the rescue kernel into the staging area. + * 4. Optionally free pages backing the original copy of the kernel, since they + * are no longer needed. + */ +static void +rescue_kernel_init(void *arg __unused) +{ + extern u_long rescue_start, rescue_end; + struct dumperinfo di; + struct diocskerneldump_arg kda; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + const struct efi_map_header *srchdr; + struct rescue_kernel_params *params; + const char *p; + caddr_t kmdp; + size_t kernlen, varlen; + vm_offset_t envstart, off; + unsigned long memsize; + int enabled, error, freeorig; + + enabled = 0; + TUNABLE_INT_FETCH(RESCUE_KENV_ENABLED, &enabled); + if (!enabled) + return; + if (!do_minidump) { + printf("rescue: minidumps are not enabled\n"); + return; + } + + kernlen = (u_long)&rescue_end - (u_long)&rescue_start; + + /* + * Figure how much memory we need to allocate. We allocate free memory + * for the rescue kernel, memory to hold the rescue kernel image, and + * 2MB for the environment and metadata, and for bootstrap page table + * pages. + */ + memsize = RESCUE_RESERV_DEFAULT_SIZE; + TUNABLE_ULONG_FETCH(RESCUE_KENV_MEMSIZE, &memsize); + memsize += round_page(kernlen); + memsize += NBPDR; + + /* + * Require memory below the 4GB boundary both for the benefit of devices + * with limited DMA addressing capabilities, and because the amd64 + * kernel assumes that it is loaded below 4GB. See amd64_loadaddr(), + * for example. 
+ */ + rescue_va = kmem_alloc_contig(memsize, M_NOWAIT | M_ZERO | M_NODUMP, + 0, (vm_paddr_t)1 << 32, RESCUE_RESERV_ALIGN, RESCUE_RESERV_BOUNDARY, + VM_MEMATTR_DEFAULT); + if (rescue_va == NULL) { + printf("rescue: failed to reserve contiguous memory\n"); + goto out; + } + rescue_pa = pmap_kextract((vm_offset_t)rescue_va); + rescue_memsize = memsize; + + params = rescue_va; + off = roundup2(sizeof(*params), sizeof(void *)); + params->kp_boothowto = boothowto; + + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + srchdr = (const struct efi_map_header *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_EFI_MAP); + if (srchdr != NULL) { + const struct efi_fb *efifb; + size_t efimaplen; + + efimaplen = rescue_kernel_init_efimap(srchdr, off, memsize); + if (efimaplen == 0) { + printf("rescue: failed to copy EFI memory map\n"); + goto out; + } + params->kp_efimapstart = rescue_pa + off; + params->kp_efimaplen = efimaplen; + off += efimaplen; + + efifb = (const struct efi_fb *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_EFI_FB); + if (efifb != NULL) { + params->kp_efifbaddr = rescue_pa + off; + if (rescue_memcpy(off, efifb, sizeof(*efifb), &off)) { + printf( + "rescue: failed to copy EFI framebuffer\n"); + goto out; + } + } + } else { + struct bios_smap smap; + size_t smaplen; + + smaplen = sizeof(struct bios_smap) + sizeof(uint32_t); + smap.base = rescue_pa; + smap.length = memsize; + smap.type = SMAP_TYPE_MEMORY; + + params->kp_smapstart = rescue_pa + off; + params->kp_smaplen = smaplen; + if (rescue_memcpy(off, &smap, sizeof(smap), &off)) { + printf("rescue: failed to copy BIOS memory map\n"); + goto out; + } + } + + /* + * Copy the host kernel's environment, with three differences: + * 1. SMP is disabled. + * 2. debug.rescue_minidump=1 from the host is omitted. + * 3. Any tunables prefixed by debug.rescue are copied without the + * prefix. This provides a mechanism to override host tunables + * if needed. Prefixed tunables are copied first since tunable + * lookups are first-match. + */ + envstart = off; + p = "kern.smp.disabled=1"; + varlen = strlen(p) + 1; + if (rescue_memcpy(off, p, varlen, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + for (int i = 0; kenvp[i] != NULL; i++) { + p = kenvp[i]; + if (strncmp(p, RESCUE_KENV_PREFIX, + sizeof(RESCUE_KENV_PREFIX) - 1) != 0) + continue; + p += sizeof(RESCUE_KENV_PREFIX) - 1; + varlen = strlen(p) + 1; + if (rescue_memcpy(off, p, varlen, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + } + for (int i = 0; kenvp[i] != NULL; i++) { + p = kenvp[i]; + if (strncmp(p, RESCUE_KENV_PREFIX, + sizeof(RESCUE_KENV_PREFIX) - 1) == 0) + continue; + varlen = strlen(p) + 1; + if (rescue_memcpy(off, p, varlen, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + } + p = "\0"; + if (rescue_memcpy(off, p, 1, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + params->kp_kenvstart = rescue_pa + envstart; + params->kp_kenvlen = off - envstart; + + /* + * Finally, reserve some space for the bootstrap page table pages. + */ + off = round_page(off); + rescue_ptps = (uint64_t *)((uintptr_t)rescue_va + off); + off += NRESCUEPTP * PAGE_SIZE; + + /* + * The kernel must be loaded at a 2MB-aligned address. 
To simplify + * location of the parameter structure, we require that the parameters, + * EFI map, bootstrap page table pages, and rescue kernel environment + * all fit in the first 2MB of the reservation. + */ + off = round_2mpage(off); + if (off != RESCUE_RESERV_KERNEL_OFFSET) { + printf("rescue: kernel metadata is too large\n"); + goto out; + } + params->kp_kernstart = rescue_pa + off; + + /* + * Copy the kernel image. This must come last since the length might + * not include that of allocated sections (i.e., .bss) depending on how + * the kernel was linked. + */ + if (rescue_memcpy(off, &rescue_start, kernlen, NULL)) { + printf("rescue: failed to copy kernel image\n"); + goto out; + } + ehdr = (Elf64_Ehdr *)((vm_offset_t)rescue_va + off); + if (!IS_ELF(*ehdr)) { + printf("rescue: kernel image is not an ELF file\n"); + goto out; + } + phdr = (Elf64_Phdr *)((vm_offset_t)ehdr + ehdr->e_phoff); + for (int i = 0; i < ehdr->e_phnum; i++) { + vm_offset_t foff; + + /* + * Zero out any segments that need it, i.e., the BSS. + */ + if (phdr[i].p_type != PT_LOAD || + phdr[i].p_filesz >= phdr[i].p_memsz) + continue; + foff = phdr[i].p_offset + phdr[i].p_filesz; + if (rescue_memset(off + foff, 0, + phdr[i].p_memsz - phdr[i].p_filesz)) { + printf("rescue: failed to zero BSS\n"); + goto out; + } + } + + /* + * Free the original copy of the rescue kernel: we don't need it + * anymore, and this releases a significant amount of memory, especially + * if the rescue kernel contains an embedded root filesystem. + */ + freeorig = 1; + TUNABLE_INT_FETCH("debug.rescue_free_kernel", &freeorig); + if (freeorig) + kmem_bootstrap_free((vm_offset_t)&rescue_start, kernlen); + + /* + * Finally tell the generic kernel dump layer that a dump device + * exists, so that it calls into rescue_kernel_exec(). + */ + memset(&di, 0, sizeof(di)); + di.dumper = rescue_dumper_dummy; + memset(&kda, 0, sizeof(kda)); + kda.kda_index = 0; /* highest priority */ + error = dumper_insert(&di, "rescue", &kda); + if (error != 0) { + printf("rescue: failed to set dump device: %d\n", error); + goto out; + } + + do_rescue_minidump = true; + printf("rescue: initialized\n"); + return; + +out: + if (rescue_va != NULL) { + kmem_free(rescue_va, memsize); + rescue_va = NULL; + rescue_pa = 0; + rescue_memsize = 0; + } + rescue_ptps = NULL; +} +SYSINIT(rescue_kernel, SI_SUB_VM_CONF, SI_ORDER_ANY, rescue_kernel_init, NULL); diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -105,6 +105,9 @@ options NETDUMP # netdump(4) client support options NETGDB # netgdb(4) client support +#options RESCUE_SUPPORT +#makeoptions RESCUE_EMBED=/path/to/rescue/kernel + # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel diff --git a/sys/amd64/conf/RESCUE b/sys/amd64/conf/RESCUE new file mode 100644 --- /dev/null +++ b/sys/amd64/conf/RESCUE @@ -0,0 +1,21 @@ +# MINIMAL would be good here, but it strips out disk drivers. +include "./GENERIC" + +ident RESCUE + +nooptions RESCUE_SUPPORT +nomakeoptions RESCUE_EMBED +makeoptions RESCUE_EMBED="no" + +makeoptions NO_MODULES= + +# Try to slim down the kernel itself. 
+options NO_SYSCTL_DESCR +nooptions WITNESS + +#options MD_ROOT +#options MD_ROOT_READONLY +#makeoptions MFS_IMAGE=/root/rescue-amd64.fs + +options RESCUE +#options ZFS diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -433,11 +433,18 @@ #define pmap_page_is_write_mapped(m) (((m)->a.flags & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) +#ifdef RESCUE_SUPPORT +/* + * The rescue image is embedded into the kernel but may be freed. + */ +#define pmap_vm_page_alloc_check(m) +#else #define pmap_vm_page_alloc_check(m) \ KASSERT(m->phys_addr < kernphys || \ m->phys_addr >= kernphys + (vm_offset_t)&_end - KERNSTART, \ - ("allocating kernel page %p pa %#lx kernphys %#lx end %p", \ - m, m->phys_addr, kernphys, &_end)); + ("allocating kernel page %p pa %#lx kernphys %#lx end %p", \ + m, m->phys_addr, kernphys, &_end)) +#endif struct thread; diff --git a/sys/amd64/include/rescue.h b/sys/amd64/include/rescue.h new file mode 100644 --- /dev/null +++ b/sys/amd64/include/rescue.h @@ -0,0 +1,77 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Juniper Networks Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _RESCUE_H_ +#define _RESCUE_H_ + +/* + * Dump parameters passed from the panicked kernel to the rescue kernel. Some + * of these are known at compile-time, but pass them anyway to avoid surprises. + */ +struct rescue_dump_params { + vm_paddr_t dp_msgbufpa; /* message buffer physaddr */ + vm_size_t dp_msgbufsz; /* message buffer size */ + vm_paddr_t dp_vmdumppa; /* vm_page_dump[] physaddr */ + vm_size_t dp_vmdumpsz; /* vm_page_dump[] size (bytes) */ + vm_paddr_t dp_dumpavailpa; /* dump_avail[] physaddr */ + vm_paddr_t dp_kernpml4pa; /* PML4 page table page physaddr */ + vm_offset_t dp_kernstart; /* beginning of KVA */ + vm_offset_t dp_kernend; /* end of mapped KVA */ + vm_offset_t dp_kernmax; /* maximum KVA */ + vm_offset_t dp_dmapmin; /* beginning of direct map range */ + vm_offset_t dp_dmapmax; /* end of direct map range */ +}; + +/* + * Memory layout parameters passed to the rescue kernel. These are used to + * bootstrap the kernel and to initialize the dumper. 
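+ *
+ * The structure is written at the very start of the rescue memory
+ * reservation, RESCUE_RESERV_KERNEL_OFFSET bytes below the rescue kernel
+ * image itself, which is how the rescue kernel finds it without any loader
+ * metadata.  Both kernels must therefore agree on its layout; see the
+ * compatibility note in rescue(4).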
+ */ +struct rescue_kernel_params { + struct rescue_dump_params kp_dumpparams; + vm_paddr_t kp_efimapstart; + vm_size_t kp_efimaplen; + vm_paddr_t kp_smapstart; + vm_size_t kp_smaplen; + vm_paddr_t kp_kenvstart; + vm_size_t kp_kenvlen; + vm_paddr_t kp_kernstart; + vm_paddr_t kp_efifbaddr; + int kp_boothowto; +}; + +/* + * The rescue kernel is copied at this offset into the rescue reservation. The + * offset must be a multiple of 2MB. + */ +#define RESCUE_RESERV_KERNEL_OFFSET NBPDR + +extern bool do_rescue_minidump; + +extern void rescue_kernel_exec(void); +extern void rescue_dumper_init(struct rescue_dump_params *); + +#endif /* !_RESCUE_H_ */ diff --git a/sys/arm/arm/machdep_boot.c b/sys/arm/arm/machdep_boot.c --- a/sys/arm/arm/machdep_boot.c +++ b/sys/arm/arm/machdep_boot.c @@ -467,19 +467,8 @@ printf("\n"); } - switch (p->md_type) { - case EFI_MD_TYPE_CODE: - case EFI_MD_TYPE_DATA: - case EFI_MD_TYPE_BS_CODE: - case EFI_MD_TYPE_BS_DATA: - case EFI_MD_TYPE_FREE: - /* - * We're allowed to use any entry with these types. - */ - break; - default: + if (!efi_physmem_type(p->md_type)) continue; - } j++; if (j >= FDT_MEM_REGIONS) diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -87,6 +87,10 @@ #include #include +#ifdef RESCUE +#include +#endif + #ifdef VFP #include #endif @@ -725,6 +729,183 @@ } +#ifdef RESCUE +static vm_offset_t +preload_add_data(vm_offset_t dst, const void *src, uint32_t type, uint32_t size) +{ + uint32_t *data; + + data = (uint32_t *)dst; + *data++ = type; + *data++ = size; + memcpy_early(data, src, size); + return ((vm_offset_t)data + roundup2(size, sizeof(void *))); +} + +static vm_offset_t +preload_add_string(vm_offset_t dst, uint32_t type, const char *s) +{ + return (preload_add_data(dst, s, type, strlen(s) + 1)); +} + +static vm_offset_t +preload_add_u64(vm_offset_t dst, uint32_t type, uint64_t val) +{ + return (preload_add_data(dst, &val, type, sizeof(val))); +} + +static vm_offset_t +preload_add_efimap(vm_offset_t dst, struct efi_map_header *efihdr) +{ + uint32_t size; + + size = roundup2(sizeof(struct efi_map_header), 16) + + efihdr->memory_size; + return (preload_add_data(dst, efihdr, + MODINFO_METADATA | MODINFOMD_EFI_MAP, size)); +} + +static vm_offset_t +preload_add_efifb(vm_offset_t dst, struct efi_fb *efifb) +{ + return (preload_add_data(dst, efifb, + MODINFO_METADATA | MODINFOMD_EFI_FB, sizeof(struct efi_fb))); +} + +static vm_offset_t +preload_add_terminator(vm_offset_t dst) +{ + memset_early((void *)dst, 0, sizeof(uint32_t) * 2); + return (dst + sizeof(uint32_t) * 2); +} + +/* + * Fake some preloaded metadata for the rescue kernel using parameters passed by + * the panicked kernel. + */ +static vm_offset_t +rescue_preload_init(void) +{ + extern u_long _end; + static pd_entry_t l1[Ln_ENTRIES] __aligned(PAGE_SIZE); + static pd_entry_t l2[Ln_ENTRIES] __aligned(PAGE_SIZE); + pd_entry_t pde, *l1p, *l2p; + struct rescue_kernel_params *params; + pd_entry_t *l0; + void *efimap, *efifb; + uint64_t ttbr0; + vm_offset_t delta, dtb, env, kernend, md, mdstart, off, paramsva; + + /* + * Find the physical load address of the kernel. Add "delta" to a + * physical address to get the corresponding virtual address relative to + * KERNBASE. + */ + delta = KERNBASE - + (arm64_address_translate_s1e1r(KERNBASE) & PAR_PA_MASK); + + /* + * Fetch the boot parameters and DTB/EFI map from the 2MB region + * physically preceding the kernel. 
We cannot assume that this region + * is mapped, so we determine its physical address and then map it via + * TTBR0 using a single L2 block entry. Then the DTB and environment + * are copied to a region following the kernel, so this mapping can be + * transient, though currently we don't tear it down. + * + * Take care to avoid clobbering the existing identity map, in case + * initarm() intends to use it. + */ + _Static_assert(KERNBASE == VM_MIN_KERNEL_ADDRESS, + "kernel does not start at TTBR1 base"); + paramsva = KERNBASE - delta - RESCUE_RESERV_KERNEL_OFFSET; + + ttbr0 = READ_SPECIALREG(ttbr0_el1) & TTBR_BADDR; + l0 = (pd_entry_t *)(uintptr_t)(ttbr0 + delta); + pde = l0[pmap_l0_index(paramsva)]; + if (pde == 0) { + l0[pmap_l0_index(paramsva)] = L0_TABLE | + ((uintptr_t)l1 - delta); + l1p = l1; + } else { + l1p = (pd_entry_t *)((pde & ~ATTR_MASK) + delta); + } + pde = l1p[pmap_l1_index(paramsva)]; + if (pde == 0) { + l1p[pmap_l1_index(paramsva)] = L1_TABLE | + ((uintptr_t)l2 - delta); + l2p = l2; + } else { + /* Currently locore does not create L1_BLOCK entries. */ + KASSERT((pde & ATTR_DESCR_MASK) == L1_TABLE, + ("invalid L1 entry %#lx", pde)); + l2p = (pd_entry_t *)((pde & ~ATTR_MASK) + delta); + } + KASSERT(l2p[pmap_l2_index(paramsva)] == 0, + ("L2 entry already exists for %#lx", paramsva)); + l2p[pmap_l2_index(paramsva)] = + L2_BLOCK | ATTR_DEFAULT | ATTR_S1_IDX(VM_MEMATTR_DEFAULT) | + ATTR_S1_nG | paramsva; + dsb(ishst); + cpu_tlb_flushID(); + + /* + * Okay, we can access our parameters now. Copy the DTB/EFI map and + * environment strings to memory following the kernel. This ensures + * that they remain mapped after the pmap is bootstrapped. This relies + * on locore providing some extra space in region following the kernel + * mapped by TTBR1. 
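+	 *
+	 * The copies made here land between _end and the MODINFOMD_KERNEND
+	 * value supplied in the faked metadata below, so the rescue kernel
+	 * should treat them as part of the loaded kernel image rather than
+	 * as free memory.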
+ */ + params = (struct rescue_kernel_params *)paramsva; + off = round_page((uintptr_t)&_end); + if (params->kp_dtbstart != 0) { + dtb = off; + memcpy_early((void *)dtb, (void *)params->kp_dtbstart, + params->kp_dtblen); + off += round_page(params->kp_dtblen); + } else if (params->kp_efimapstart != 0) { + efimap = (void *)off; + memcpy_early(efimap, (void *)params->kp_efimapstart, + params->kp_efimaplen); + off += round_page(params->kp_efimaplen); + } + if (params->kp_efifbaddr != 0) { + efifb = (void *)off; + memcpy_early(efifb, (void *)params->kp_efifbaddr, + sizeof(struct efi_fb)); + off += round_page(sizeof(struct efi_fb)); + } + + env = off; + memcpy_early((void *)env, (void *)params->kp_kenvstart, + params->kp_kenvlen); + off += round_page(params->kp_kenvlen); + + md = mdstart = off; + kernend = mdstart + PAGE_SIZE; + + md = preload_add_string(md, MODINFO_NAME, "kernel"); + md = preload_add_string(md, MODINFO_TYPE, "elf kernel"); + md = preload_add_u64(md, MODINFO_ADDR, VM_MIN_KERNEL_ADDRESS); + md = preload_add_u64(md, MODINFO_SIZE, (uintptr_t)&_end - KERNBASE); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_KERNEND, kernend); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_HOWTO, + params->kp_boothowto); + if (params->kp_dtbstart != 0) + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_DTBP, + dtb); + else if (params->kp_efimapstart != 0) + md = preload_add_efimap(md, efimap); + if (params->kp_efifbaddr != 0) + md = preload_add_efifb(md, efifb); + md = preload_add_u64(md, MODINFO_METADATA | MODINFOMD_ENVP, env); + preload_add_terminator(md); + + rescue_dumper_init(¶ms->kp_dumpparams); + + return (mdstart); +} +#endif /* RESCUE */ + #ifdef FDT static void try_load_dtb(caddr_t kmdp) @@ -896,6 +1077,16 @@ caddr_t kmdp; bool valid; +#ifdef RESCUE + /* + * The rescue kernel runs without any module metadata. The panicked + * kernel could provide it, but some variables, like the size of the + * loaded rescue kernel, can't easily be determined there. So, fake it + * here. + */ + abp->modulep = rescue_preload_init(); +#endif + TSRAW(&thread0, TS_ENTER, __func__, NULL); boot_el = abp->boot_el; diff --git a/sys/arm64/arm64/minidump_machdep.c b/sys/arm64/arm64/minidump_machdep.c --- a/sys/arm64/arm64/minidump_machdep.c +++ b/sys/arm64/arm64/minidump_machdep.c @@ -53,6 +53,9 @@ #include #include #include +#ifdef RESCUE_SUPPORT +#include +#endif CTASSERT(sizeof(struct kerneldumpheader) == 512); @@ -224,6 +227,14 @@ } dumpsize += PAGE_SIZE; +#ifdef RESCUE_SUPPORT + if (do_rescue_minidump) { + rescue_kernel_exec(); + /* Shouldn't return here unless something goes very wrong. */ + return (ENXIO); + } +#endif + dumpsys_pb_init(dumpsize); /* Initialize mdhdr */ diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c --- a/sys/arm64/arm64/mp_machdep.c +++ b/sys/arm64/arm64/mp_machdep.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,9 @@ #include #include #include +#ifdef RESCUE_SUPPORT +#include +#endif #include #ifdef VFP #include @@ -347,6 +351,13 @@ dbg_register_sync(NULL); #endif +#ifdef RESCUE_SUPPORT + if (dumping) { + /* Never returns. 
*/ + rescue_kernel_exec(); + } +#endif + CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); CTR0(KTR_SMP, "IPI_STOP (restart)"); diff --git a/sys/arm64/arm64/rescue_dumper.c b/sys/arm64/arm64/rescue_dumper.c new file mode 100644 --- /dev/null +++ b/sys/arm64/arm64/rescue_dumper.c @@ -0,0 +1,471 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2020, 2023 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +enum dump_segs { + DUMP_SEG_MDHDR = 0, /* minidump header */ + DUMP_SEG_MSGBUF, /* kernel message buffer */ + DUMP_SEG_DUMP_AVAIL, /* physical memory ranges */ + DUMP_SEG_BITMAP, /* vm_page_dump array */ + DUMP_SEG_PTPS, /* kernel page table pages */ + DUMP_SEG_PAGES, /* pages marked in vm_page_dump */ + DUMP_SEG_COUNT, +}; + +struct dump_seg { + vm_offset_t ds_addr; + vm_size_t ds_sz; +}; + +struct dump_softc { + struct minidumphdr *sc_mdhdr; + struct dump_seg sc_segs[DUMP_SEG_COUNT]; + vm_offset_t sc_kernl0; + vm_offset_t sc_scratchkva; + char *sc_scratchbuf; + u_long sc_npages; + off_t sc_cursor; +}; + +FEATURE(rescue, "rescue kernel dumper"); + +static MALLOC_DEFINE(M_DUMPER, "dumper", "Rescue dumper structures"); + +static struct rescue_dump_params params; + +void +rescue_dumper_init(struct rescue_dump_params *p) +{ + memcpy(¶ms, p, sizeof(params)); +} + +static void +dump_seg_init(struct dump_seg *seg, vm_offset_t addr, vm_size_t sz) +{ + seg->ds_addr = addr; + seg->ds_sz = sz; +} + +static vm_offset_t +map_host_seg(vm_paddr_t pa, vm_size_t size) +{ + vm_offset_t va; + + size = round_page(size + (pa & PAGE_MASK)); + va = kva_alloc(size); + if (va != 0) + pmap_kenter(va, size, pa & ~PAGE_MASK, VM_MEMATTR_WRITE_BACK); + return (va + (pa & PAGE_MASK)); +} + +static void +unmap_host_seg(struct dump_seg *seg) +{ + vm_offset_t va; + vm_size_t off, size; + + va = seg->ds_addr; + if (va == 0) + return; + + size = round_page(seg->ds_sz + (va & PAGE_MASK)); + va &= ~PAGE_MASK; + for (off = 0; off < size; off += PAGE_SIZE) + pmap_kremove(va + off); + kva_free(va, size); +} + +static void +dumper_cdevpriv_dtr(void 
*arg) +{ + struct dump_softc *sc; + + sc = arg; + + free(sc->sc_scratchbuf, M_DUMPER); + if (sc->sc_scratchkva != 0) + kva_free(sc->sc_scratchkva, PAGE_SIZE); + if (sc->sc_kernl0 != 0) + pmap_kremove(sc->sc_kernl0); + + unmap_host_seg(&sc->sc_segs[DUMP_SEG_BITMAP]); + unmap_host_seg(&sc->sc_segs[DUMP_SEG_DUMP_AVAIL]); + unmap_host_seg(&sc->sc_segs[DUMP_SEG_MSGBUF]); + + free(sc->sc_mdhdr, M_DUMPER); + free(sc, M_DUMPER); +} + +CTASSERT(sizeof(struct minidumphdr) <= PAGE_SIZE); + +static int +dumper_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + struct dump_softc *sc; + struct minidumphdr *mdhdr; + uint64_t *bitmap; + vm_offset_t va; + u_long i; + int error; + + sc = malloc(sizeof(*sc), M_DUMPER, M_WAITOK | M_ZERO); + + /* + * The minidump header gets padded out to a full page. + */ + mdhdr = malloc(PAGE_SIZE, M_DUMPER, M_WAITOK | M_ZERO); + (void)strcpy(mdhdr->magic, MINIDUMP_MAGIC); + mdhdr->version = MINIDUMP_VERSION; + mdhdr->msgbufsize = round_page(params.dp_msgbufsz); + mdhdr->bitmapsize = round_page(params.dp_vmdumpsz); + mdhdr->pmapsize = howmany(params.dp_kernend - params.dp_kernstart, + L2_SIZE) * PAGE_SIZE; + mdhdr->kernbase = params.dp_kernstart; + mdhdr->dmapphys = params.dp_dmapbasepa; + mdhdr->dmapbase = params.dp_dmapmin; + mdhdr->dmapend = params.dp_dmapmax; + mdhdr->dumpavailsize = round_page(sizeof(dump_avail)); + sc->sc_mdhdr = mdhdr; + + dump_seg_init(&sc->sc_segs[DUMP_SEG_MDHDR], (vm_offset_t)mdhdr, + PAGE_SIZE); + + /* + * Map the root kernel page table page. It is not included in the dump, + * but is needed in order to walk the page tables so it might as well be + * statically mapped. + * + * Also allocate a page of KVA to map the rest of the kernel page table + * pages during walks. + */ + sc->sc_kernl0 = map_host_seg(params.dp_kernl0pa, PAGE_SIZE); + if (sc->sc_kernl0 == 0) { + error = ENOMEM; + goto err; + } + sc->sc_scratchkva = kva_alloc(PAGE_SIZE); + if (sc->sc_scratchkva == 0) { + error = ENOMEM; + goto err; + } + + /* + * In some cases it is necessary to synthesize a fake page table page. + */ + sc->sc_scratchbuf = malloc(PAGE_SIZE, M_DUMPER, M_WAITOK | M_ZERO); + + /* + * Map segments of the host kernel that get included in the minidump. + */ + va = map_host_seg(params.dp_msgbufpa, mdhdr->msgbufsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_MSGBUF], va, mdhdr->msgbufsize); + + va = map_host_seg(params.dp_dumpavailpa, mdhdr->dumpavailsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_DUMP_AVAIL], va, + mdhdr->dumpavailsize); + + va = map_host_seg(params.dp_vmdumppa, mdhdr->bitmapsize); + if (va == 0) { + error = ENOMEM; + goto err; + } + dump_seg_init(&sc->sc_segs[DUMP_SEG_BITMAP], va, mdhdr->bitmapsize); + + /* + * Create a virtual dump segment for the kernel page tables and marked + * host pages. + */ + dump_seg_init(&sc->sc_segs[DUMP_SEG_PTPS], 0, mdhdr->pmapsize); + + sc->sc_npages = 0; + bitmap = (uint64_t *)sc->sc_segs[DUMP_SEG_BITMAP].ds_addr; + for (i = 0; i < mdhdr->bitmapsize / sizeof(uint64_t); i++) + sc->sc_npages += bitcount64(bitmap[i]); + dump_seg_init(&sc->sc_segs[DUMP_SEG_PAGES], 0, + sc->sc_npages * PAGE_SIZE); + + error = devfs_set_cdevpriv(sc, dumper_cdevpriv_dtr); + if (error != 0) + goto err; + + return (0); + +err: + dumper_cdevpriv_dtr(sc); + return (error); +} + +/* + * Map a host page directory page. 
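+ * The entry's physical address is entered at the single page of scratch KVA
+ * allocated in dumper_open(), so at most one host page table page is mapped
+ * this way at any given time.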
+ */ +static pd_entry_t * +map_pde(struct dump_softc *sc, pd_entry_t pde) +{ + vm_offset_t scratch; + + scratch = sc->sc_scratchkva; + pmap_kenter(scratch, PAGE_SIZE, pde & ~ATTR_MASK, + VM_MEMATTR_WRITE_BACK); + return ((pd_entry_t *)scratch); +} + +/* + * Return a host page table page mapping the specified virtual address. + */ +static void * +map_ptp(struct dump_softc *sc, vm_offset_t va) +{ + pd_entry_t *l0, *l1, *l2, *l3; + vm_paddr_t pa; + + KASSERT((va & L2_OFFSET) == 0, ("%s: unaligned VA %#lx", __func__, va)); + + l0 = (pd_entry_t *)sc->sc_kernl0 + pmap_l0_index(va); + if ((*l0 & ATTR_DESCR_MASK) != L0_TABLE) { + /* Invalid entry, return a zero-filled page. */ + memset(sc->sc_scratchbuf, 0, PAGE_SIZE); + return (sc->sc_scratchbuf); + } + + l1 = map_pde(sc, *l0); + l1 = &l1[pmap_l1_index(va)]; + if ((*l1 & ATTR_DESCR_MASK) == L1_BLOCK) { + /* Dump a 1GB mapping using a fake PTP. */ + pa = (*l1 & ~ATTR_MASK) | (va & L1_OFFSET); + l3 = (pd_entry_t *)sc->sc_scratchbuf; + for (int i = 0; i < Ln_ENTRIES; i++) + l3[i] = pa + (i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; + return (l3); + } + if ((*l1 & ATTR_DESCR_MASK) != L1_TABLE) { + /* Invalid entry, return a zero-filled page. */ + memset(sc->sc_scratchbuf, 0, PAGE_SIZE); + return (sc->sc_scratchbuf); + } + + l2 = map_pde(sc, *l1); + l2 = &l2[pmap_l2_index(va)]; + if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) { + /* Dump a 2MB mapping using a fake PTP. */ + pa = *l2 & ~ATTR_MASK; + l3 = (pd_entry_t *)sc->sc_scratchbuf; + for (int i = 0; i < Ln_ENTRIES; i++) + l3[i] = pa + (i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; + return (l3); + } + if ((*l2 & ATTR_DESCR_MASK) != L2_TABLE) { + /* Invalid entry, return a zero-filled page. */ + memset(sc->sc_scratchbuf, 0, PAGE_SIZE); + return (sc->sc_scratchbuf); + } + + /* Dump the leaf page table page. */ + l3 = map_pde(sc, *l2); + return (l3); +} + +static int +dumper_read_seg(struct dump_softc *sc, enum dump_segs idx, struct dump_seg *seg, + off_t baseoff, struct uio *uio) +{ + off_t off; + int error; + + KASSERT(baseoff <= uio->uio_offset && + baseoff + seg->ds_sz > uio->uio_offset, + ("%s: invalid offset %#lx into seg at %#lx-%#lx", __func__, + uio->uio_offset, baseoff, baseoff + seg->ds_sz)); + + error = 0; + off = uio->uio_offset - baseoff; + switch (idx) { + case DUMP_SEG_MDHDR: + case DUMP_SEG_MSGBUF: + case DUMP_SEG_DUMP_AVAIL: + case DUMP_SEG_BITMAP: + /* Linear segments can simply be copied. */ + error = uiomove((char *)seg->ds_addr + off, seg->ds_sz - off, + uio); + break; + case DUMP_SEG_PTPS: + /* Dump leaf page table pages. */ + for (vm_offset_t va = + params.dp_kernstart + (off / PAGE_SIZE) * L2_SIZE; + va < params.dp_kernend; va += L2_SIZE) { + char *ptp; + + ptp = map_ptp(sc, va); + error = uiomove(ptp + (off & PAGE_MASK), + PAGE_SIZE - (off & PAGE_MASK), uio); + if (error != 0 || uio->uio_resid == 0) + break; + off = uio->uio_offset - baseoff; + } + break; + case DUMP_SEG_PAGES: { + struct bitset *bitset; + vm_paddr_t *avail, pa; + size_t bitsetsize; + off_t off1; + long bit; + + avail = (vm_paddr_t *)sc->sc_segs[DUMP_SEG_DUMP_AVAIL].ds_addr; + + /* Dump pages marked in the bitmap. This is non-destructive. 
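Each set bit is an index into the pages described by the concatenated dump_avail ranges, so the bit number is translated back to a physical address by walking the dump_avail pairs copied from the host.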
*/ + bitset = (struct bitset *)sc->sc_segs[DUMP_SEG_BITMAP].ds_addr; + bitsetsize = sc->sc_segs[DUMP_SEG_BITMAP].ds_sz; + off1 = 0; + BIT_FOREACH_ISSET(bitsetsize * NBBY, bit, bitset) { + int i; + + if (off1 < off) { + off1 += PAGE_SIZE; + continue; + } + + for (i = 0; avail[i + 1] != 0; i += 2) { + int npages; + + npages = howmany(avail[i + 1], PAGE_SIZE) - + avail[i] / PAGE_SIZE; + if (bit < npages) { + pa = avail[i] + bit * PAGE_SIZE; + break; + } + bit -= npages; + } + if (avail[i + 1] == 0) + panic("failed to map bit %ld to a page", bit); + + pmap_kenter(sc->sc_scratchkva, PAGE_SIZE, pa, + VM_MEMATTR_WRITE_BACK); + error = uiomove((char *)sc->sc_scratchkva + + (off % PAGE_SIZE), + PAGE_SIZE - (off % PAGE_SIZE), + uio); + if (error != 0) + break; + if (uio->uio_resid == 0) + break; + off = off1 = uio->uio_offset - baseoff; + } + break; + } + default: + panic("%s: unknown segment index %d", __func__, idx); + } + + return (error); +} + +static int +dumper_read(struct cdev *dev, struct uio *uio, int flags) +{ + struct dump_softc *sc; + struct dump_seg *seg; + off_t baseoff, off; + int error, i; + + error = devfs_get_cdevpriv((void **)&sc); + if (error != 0) + return (error); + + off = uio->uio_offset; + if (off < 0) + return (EINVAL); + + /* Seeks are not supported. */ + if (off != sc->sc_cursor) + return (ESPIPE); + + for (baseoff = 0, i = 0; i < DUMP_SEG_COUNT; i++) { + seg = &sc->sc_segs[i]; + if (off >= baseoff && off < baseoff + seg->ds_sz) { + error = dumper_read_seg(sc, i, seg, baseoff, uio); + break; + } + baseoff += seg->ds_sz; + MPASS((baseoff & PAGE_MASK) == 0); + } + + sc->sc_cursor = uio->uio_offset; + return (error); +} + +static struct cdevsw dumper_cdevsw = { + .d_version = D_VERSION, + .d_open = dumper_open, + .d_read = dumper_read, + .d_name = "dumper", +}; + +static int +dumper_modevent(module_t mod __unused, int type, void *data __unused) +{ + static struct cdev *dumper_dev; + + switch (type) { + case MOD_LOAD: + dumper_dev = make_dev(&dumper_cdevsw, 0, UID_ROOT, GID_WHEEL, + 0600, "dumper"); + break; + case MOD_UNLOAD: + destroy_dev(dumper_dev); + break; + } + return (0); +} +DEV_MODULE(dumper, dumper_modevent, NULL); +MODULE_VERSION(dumper, 1); diff --git a/sys/arm64/arm64/rescue_machdep.c b/sys/arm64/arm64/rescue_machdep.c new file mode 100644 --- /dev/null +++ b/sys/arm64/arm64/rescue_machdep.c @@ -0,0 +1,598 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2020, 2023 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "opt_platform.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef FDT +#include +#include +#endif + +bool do_rescue_minidump = false; + +/* + * Parameters for memory reserved for the rescue kernel. The boundary and + * alignment are fixed by the requirements of locore. The size is configurable + * but of course must be satisfiable by an allocation with the defined alignment + * and boundary requirements. + */ +#define RESCUE_RESERV_ALIGN (2 * 1024 * 1024u) /* 2MB */ +#define RESCUE_RESERV_BOUNDARY (1024 * 1024 * 1024u) /* 1GB */ +#define RESCUE_RESERV_DEFAULT_SIZE (128 * 1024 * 1024u) /* 128MB */ + +/* + * Environment variables beginning with this prefix are copied into the rescue + * kernel's environment with the prefix stripped. + */ +#define RESCUE_KENV_PREFIX "debug.rescue." +#define RESCUE_KENV_ENABLED "debug.rescue_minidump" +#define RESCUE_KENV_MEMSIZE "debug.rescue_memsize" + +static void *rescue_va; +static vm_paddr_t rescue_pa; +static vm_size_t rescue_memsize; + +/* + * Called from the host kernel to populate rescue dumper parameters. + * The returned structure is passed to the rescue kernel. + */ +static void +rescue_dump_params_init(struct rescue_dump_params *rdp) +{ + rdp->dp_msgbufpa = vtophys(msgbufp->msg_ptr); + rdp->dp_msgbufsz = msgbufp->msg_size; + rdp->dp_vmdumppa = vtophys(vm_page_dump); + rdp->dp_vmdumpsz = round_page(BITSET_SIZE(vm_page_dump_pages)); + rdp->dp_dumpavailpa = vtophys(dump_avail); + rdp->dp_kernl0pa = vtophys(kernel_pmap->pm_l0); + rdp->dp_kernstart = VM_MIN_KERNEL_ADDRESS; + rdp->dp_kernend = kernel_vm_end; + rdp->dp_kernmax = VM_MAX_KERNEL_ADDRESS; + rdp->dp_dmapbasepa = DMAP_MIN_PHYSADDR; + rdp->dp_dmapmin = DMAP_MIN_ADDRESS; + rdp->dp_dmapmax = DMAP_MAX_ADDRESS; +} + +static void +rescue_kernel_cpu_switch(void) +{ + struct pcpu *pcpu; + + pcpu = cpuid_to_pcpu[0]; + if (get_pcpu() != pcpu) { + CPU_SET_ATOMIC(pcpu->pc_cpuid, &started_cpus); + for (;;) + cpu_spinwait(); + } +} + +/* + * Make the final preparations to jump into the rescue kernel, and then do it. + */ +void +rescue_kernel_exec(void) +{ + static pd_entry_t pt_l0[Ln_ENTRIES] __aligned(PAGE_SIZE); + static pd_entry_t pt_l1[Ln_ENTRIES] __aligned(PAGE_SIZE); + static pd_entry_t pt_l2[Ln_ENTRIES] __aligned(PAGE_SIZE); + struct rescue_kernel_params *params; + void (*rescue)(u_long modulep); + Elf64_Ehdr *ehdr; + vm_paddr_t pa; + off_t entryoff; + + /* + * Switch to the boot CPU if we are not already on it. + */ + rescue_kernel_cpu_switch(); + + printf("rescue: preparing to exec rescue kernel\n"); + + /* + * Acknowledge any active interrupts to avoid leaving the PIC in an + * indeterminate state. Mute system errors: the rescue kernel will + * re-enable them once it's prepared to handle them. 
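+ * intr_isrc_reset() acknowledges the interrupt source being dispatched on
+ * this CPU, if any, via PIC_POST_FILTER; serror_disable() masks SErrors by
+ * setting the DAIF.A bit.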
+ */ + intr_isrc_reset(); + serror_disable(); + + /* + * Prepare the dump parameters structure for the rescue kernel. The + * rest of the parameters must already have been initialized. These + * will be accessed via an aliasing mapping, so make sure the cache is + * written back. + */ + params = rescue_va; + rescue_dump_params_init(¶ms->kp_dumpparams); + cpu_dcache_wb_range(params, sizeof(*params)); + + /* + * Construct an identity map for the rescue kernel's locore. This + * covers the entire reservation. Because it does not span a 1GB + * boundary, only three pages are needed. This will be replaced by + * locore. + */ + pt_l0[pmap_l0_index(rescue_pa)] = L0_TABLE | vtophys(pt_l1); + pt_l1[pmap_l1_index(rescue_pa)] = L1_TABLE | vtophys(pt_l2); + for (pa = rescue_pa; pa < rescue_pa + rescue_memsize; pa += L2_SIZE) + pt_l2[pmap_l2_index(pa)] = L2_BLOCK | ATTR_SF | pmap_sh_attr | + ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE) | ATTR_S1_nG | pa; + dsb(ishst); + + set_ttbr0(pmap_kextract((vm_offset_t)pt_l0)); + cpu_tlb_flushID(); + + ehdr = (Elf64_Ehdr *)((char *)rescue_pa + RESCUE_RESERV_KERNEL_OFFSET); + if (IS_ELF(*ehdr)) + entryoff = ehdr->e_entry - KERNBASE; + else + entryoff = 0; + + /* + * Jump to the entry point. Currently we pass a dummy module pointer to + * ensure that locore maps some memory following the rescue kernel, but + * this is really a hack to avoid modifying locore. + */ + rescue = (void *)(rescue_pa + RESCUE_RESERV_KERNEL_OFFSET + entryoff); + (rescue)(KERNBASE + rescue_memsize); +} + +/* + * Dummy function to satisfy the dumper interface. This should never be + * called. + */ +static int +rescue_dumper_dummy(void *priv, void *virtual, off_t offset, size_t length) +{ + printf("%s: unexpected call\n", __func__); + return (EOPNOTSUPP); +} + +/* + * Copy a buffer into the rescue kernel's memory reservation at the specified + * offset. Returns an error if the copy would overflow the reservation buffer. + */ +static int +rescue_memcpy(vm_offset_t off, const void *src, size_t size, vm_offset_t *offp) +{ + if (off >= rescue_memsize || off + size > rescue_memsize) + return (1); + + memcpy((char *)rescue_va + off, src, size); + if (offp != NULL) + *offp = off + size; + return (0); +} + +#ifdef FDT +/* + * Copy the DTB into the reserved region and update its memory map to restrict + * the rescue kernel's address space to the reservation. + */ +static size_t +rescue_kernel_init_fdt(vm_paddr_t pa, vm_offset_t off, unsigned long memsize) +{ + void *dtbp, *fdtp; + const uint32_t *addr_cellsp, *size_cellsp; + uint8_t *buf, *sb; + caddr_t kmdp; + size_t dtblen; + uint32_t addr_cells, size_cells; + int error, len, memoff, rootoff; + bool new_prop = false; + + /* + * Copy the DTB into the reserved area. It would be simpler to copy the + * kernel to the base of the reservation and copy the DTB to the space + * following the kernel, but we do not know the kernel's full size. + * Thus the DTB is copied first and the kernel is copied to the next + * 2MB-aligned address. + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, void *); + /* Allocate 1 extra page for any fixups needed. */ + dtblen = fdt_totalsize(dtbp) + PAGE_SIZE; + + fdtp = malloc(dtblen, M_TEMP, M_WAITOK | M_ZERO); + memcpy(fdtp, dtbp, dtblen); + + /* + * Fix up the DTB used by the rescue kernel: update the memory node to + * point at reserved memory, and delete the rescue and memreserve nodes. 
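+ * If the DTB lacks a /memory node altogether, one is synthesized so that a
+ * reg property describing the reservation can be installed.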
+ */ + rootoff = fdt_path_offset(fdtp, "/"); + if (rootoff < 0) { + printf("rescue: failed to look up FDT root offset\n"); + return (0); + } + memoff = fdt_path_offset(fdtp, "/memory"); + if (memoff < 0) { + printf("rescue: failed to look up FDT memory offset, trying to fake one\n"); + memoff = fdt_add_subnode(fdtp, rootoff, "memory"); + if (memoff < 0) { + printf("rescue: failed to create the FDT memory node\n"); + return (0); + } + new_prop = true; + } + addr_cellsp = fdt_getprop(fdtp, rootoff, "#address-cells", NULL); + if (addr_cellsp == NULL) { + printf("rescue: failed to look up address-cells property\n"); + return (0); + } + size_cellsp = fdt_getprop(fdtp, rootoff, "#size-cells", NULL); + if (addr_cellsp == NULL || size_cellsp == NULL) { + printf("rescue: failed to look up address-cells property\n"); + return (0); + } + addr_cells = fdt32_to_cpu(*addr_cellsp); + size_cells = fdt32_to_cpu(*size_cellsp); + + len = (addr_cells + size_cells) * sizeof(uint32_t); + if (!new_prop && fdt_getprop(fdtp, memoff, "reg", &len) == NULL) { + printf("rescue: memory node has no reg property.\n"); + printf(" Will try to generate one\n"); + new_prop = true; + } + if (len < (addr_cells + size_cells) * sizeof(uint32_t)) { + printf("rescue: reg property too small\n"); + return (0); + } + sb = buf = malloc(len, M_TEMP, M_WAITOK | M_ZERO); + if (addr_cells == 2) + *(uint64_t *)buf = cpu_to_fdt64(pa); + else + *(uint32_t *)buf = cpu_to_fdt32(pa); + buf += addr_cells * sizeof(uint32_t); + if (size_cells == 2) + *(uint64_t *)buf = cpu_to_fdt64(memsize); + else + *(uint32_t *)buf = cpu_to_fdt32(memsize); + if (new_prop) + error = fdt_setprop(fdtp, memoff, "reg", sb, len); + else + error = fdt_setprop_inplace(fdtp, memoff, "reg", sb, len); + + free(sb, M_TEMP); + if (error != 0) { + printf("rescue: failed to update reg property: %d\n", error); + return (0); + } + + if (rescue_memcpy(off, fdtp, dtblen, NULL) != 0) { + printf("rescue: failed to copy FDT\n"); + return (0); + } + + return (dtblen); +} +#endif + +static size_t +rescue_kernel_init_efimap(const struct efi_map_header *srchdr, vm_offset_t off, + unsigned long memsize) +{ + struct efi_map_header hdr; + const struct efi_md *srcmd; + struct efi_md *dstmd; + vm_offset_t start, end; + const size_t hdrsz = roundup2(sizeof(struct efi_map_header), 16); + int ndesc; + + start = end = off; + + memcpy(&hdr, srchdr, sizeof(hdr)); + end += hdrsz; + + /* + * Copy the memory map, excluding RAM entries that do not overlap with + * the rescue reservation. 
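+ * The RAM entry that does contain the reservation is rewritten to describe
+ * exactly the reserved range, so the rescue kernel's phys_avail covers only
+ * memory it owns.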
+ */ + srcmd = (const struct efi_md *)((const uint8_t *)srchdr + hdrsz); + ndesc = srchdr->memory_size / srchdr->descriptor_size; + for (int i = 0; i < ndesc; i++) { + if (efi_physmem_type(srcmd->md_type)) { + if (srcmd->md_phys <= rescue_pa && + srcmd->md_phys + ptoa(srcmd->md_pages) >= + rescue_pa) { + dstmd = malloc(srchdr->descriptor_size, M_TEMP, + M_WAITOK); + memcpy(dstmd, srcmd, srchdr->descriptor_size); + dstmd->md_phys = rescue_pa; + dstmd->md_pages = atop(memsize); + bool err = rescue_memcpy(end, dstmd, + srchdr->descriptor_size, &end); + free(dstmd, M_TEMP); + if (err) + return (0); + } + } else if (rescue_memcpy(end, srcmd, srchdr->descriptor_size, + &end)) { + return (0); + } + srcmd = efi_next_descriptor(__DECONST(void *, srcmd), + srchdr->descriptor_size); + } + hdr.memory_size = end - start - sizeof(hdr); + if (rescue_memcpy(start, &hdr, sizeof(hdr), NULL)) + return (0); + + return (end - start); +} + +static void +rescue_kernel_init(void *arg __unused) +{ + extern u_long rescue_start, rescue_end; + struct dumperinfo di; + struct diocskerneldump_arg kda; + struct rescue_kernel_params *params; + char *p; + size_t kernlen, varlen; + vm_offset_t envstart, off; + unsigned long memsize; + int enabled, error; + + enabled = 0; + TUNABLE_INT_FETCH("debug.rescue_minidump", &enabled); + if (!enabled) + return; + if (!do_minidump) { + printf("rescue: minidumps are not enabled\n"); + return; + } + + kernlen = (u_long)&rescue_end - (u_long)&rescue_start; + + /* + * Figure how much memory we need to allocate. We allocate free memory + * for the rescue kernel, memory to hold the rescue kernel image, and + * 2MB for the environment and metadata, and for bootstrap page table + * pages. + */ + memsize = RESCUE_RESERV_DEFAULT_SIZE; + TUNABLE_ULONG_FETCH(RESCUE_KENV_MEMSIZE, &memsize); + memsize += round_page(kernlen); + memsize += L2_SIZE; + + /* + * First try to obtain memory below the 4GB boundary for the benefit of + * devices with limited DMA addressing capabilities. This might not be + * possible depending on the layout of the physical address space. + */ + rescue_va = kmem_alloc_contig(memsize, M_NOWAIT | M_ZERO | M_NODUMP, + 0, (vm_paddr_t)1 << 32, RESCUE_RESERV_ALIGN, RESCUE_RESERV_BOUNDARY, + VM_MEMATTR_DEFAULT); + if (rescue_va == NULL) { + rescue_va = kmem_alloc_contig(memsize, + M_NOWAIT | M_ZERO | M_NODUMP, + 0, ~(vm_paddr_t)0, RESCUE_RESERV_ALIGN, + RESCUE_RESERV_BOUNDARY, VM_MEMATTR_DEFAULT); + if (rescue_va == NULL) { + printf("rescue: failed to reserve contiguous memory\n"); + goto out; + } + } + rescue_pa = pmap_kextract((vm_offset_t)rescue_va); + if (rescue_pa >= VM_MAX_USER_ADDRESS) { + /* We might need to handle this case at some point. 
*/ + printf("rescue: reserved memory cannot be mapped by TTBR0\n"); + goto out; + } + rescue_memsize = memsize; + + params = rescue_va; + off = roundup2(sizeof(*params), sizeof(void *)); + params->kp_boothowto = boothowto; + + switch (arm64_bus_method) { +#ifdef FDT + case ARM64_BUS_FDT: { + size_t dtblen; + + dtblen = rescue_kernel_init_fdt(rescue_pa, off, memsize); + if (dtblen == 0) + goto out; + params->kp_dtbstart = rescue_pa + off; + params->kp_dtblen = dtblen; + off += dtblen; + break; + } +#endif + case ARM64_BUS_ACPI: { + const struct efi_map_header *srchdr; + const struct efi_fb *efifb; + caddr_t kmdp; + size_t efimaplen; + + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + srchdr = (const struct efi_map_header *)preload_search_info( + kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); + if (srchdr == NULL) { + printf("rescue: failed to find EFI map\n"); + goto out; + } + + efimaplen = rescue_kernel_init_efimap(srchdr, off, memsize); + if (efimaplen == 0) + goto out; + params->kp_efimapstart = rescue_pa + off; + params->kp_efimaplen = efimaplen; + off += efimaplen; + + efifb = (const struct efi_fb *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_EFI_FB); + if (efifb != NULL) { + params->kp_efifbaddr = rescue_pa + off; + if (rescue_memcpy(off, efifb, sizeof(*efifb), &off)) { + printf( + "rescue: failed to copy EFI framebuffer\n"); + goto out; + } + } + break; + } + default: + printf("rescue: unsupported bus method %d\n", arm64_bus_method); + goto out; + } + + /* + * Copy the host kernel's environment, with three differences: + * 1. SMP is disabled. + * 2. debug.rescue_minidump=1 from the host is omitted. + * 3. Any tunables prefixed by debug.rescue are copied without the + * prefix. This provides a mechanism to override host tunables + * if needed. Prefixed tunables are copied first since tunable + * lookups are first-match. + */ + envstart = off; + p = "kern.smp.disabled=1"; + varlen = strlen(p) + 1; + if (rescue_memcpy(off, p, varlen, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + for (int i = 0; kenvp[i] != NULL; i++) { + p = kenvp[i]; + if (strncmp(p, RESCUE_KENV_PREFIX, + sizeof(RESCUE_KENV_PREFIX) - 1) != 0) + continue; + p += sizeof(RESCUE_KENV_PREFIX) - 1; + varlen = strlen(p) + 1; + if (rescue_memcpy(off, p, varlen, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + } + for (int i = 0; kenvp[i] != NULL; i++) { + p = kenvp[i]; + if (strncmp(p, RESCUE_KENV_PREFIX, + sizeof(RESCUE_KENV_PREFIX) - 1) == 0) + continue; + varlen = strlen(p) + 1; + if (rescue_memcpy(off, p, varlen, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + } + p = "\0"; + if (rescue_memcpy(off, p, 1, &off)) { + printf("rescue: failed to copy tunable\n"); + goto out; + } + params->kp_kenvstart = rescue_pa + envstart; + params->kp_kenvlen = off - envstart; + + /* + * The kernel must be loaded at a 2MB-aligned address. To simplify + * location of the parameter structure, we require that the parameters, + * DTB and rescue kernel environment all fit in the first 2MB of the + * reservation. + */ + off = roundup2(off, L2_SIZE); + if (off != RESCUE_RESERV_KERNEL_OFFSET) { + printf("rescue: kernel metadata is too large\n"); + goto out; + } + params->kp_kernstart = rescue_pa + off; + + /* + * Copy the kernel image. This must come last since the file size may + * not include that of allocated segments. 
+ */ + if (rescue_memcpy(off, &rescue_start, kernlen, NULL)) { + printf("rescue: failed to copy kernel image\n"); + goto out; + } + cpu_dcache_wbinv_range(rescue_va, memsize); + arm64_aliasing_icache_sync_range((vm_offset_t)rescue_va, + memsize); + + /* + * Finally tell the generic kernel dump layer that a dump device + * exists, so that it calls into rescue_kernel_exec(). + */ + memset(&di, 0, sizeof(di)); + di.dumper = rescue_dumper_dummy; + memset(&kda, 0, sizeof(kda)); + kda.kda_index = 0; /* highest priority */ + error = dumper_insert(&di, "rescue", &kda); + if (error != 0) { + printf("rescue: failed to set dump device: %d\n", error); + goto out; + } + + do_rescue_minidump = true; + printf("rescue: initialized\n"); + return; + +out: + if (rescue_va != NULL) { + kmem_free(rescue_va, memsize); + rescue_va = NULL; + rescue_pa = 0; + rescue_memsize = 0; + } +} +SYSINIT(rescue_kernel, SI_SUB_VM_CONF, SI_ORDER_ANY, rescue_kernel_init, NULL); diff --git a/sys/arm64/conf/RESCUE b/sys/arm64/conf/RESCUE new file mode 100644 --- /dev/null +++ b/sys/arm64/conf/RESCUE @@ -0,0 +1,19 @@ +include "./GENERIC" + +ident RESCUE + +nooptions RESCUE_SUPPORT +nomakeoptions RESCUE_EMBED +makeoptions RESCUE_EMBED="no" + +makeoptions NO_MODULES= + +#makeoptions MFS_IMAGE=/root/rescue.img + +# Try to keep the rescue kernel small. Ideally we could use a MINIMAL config, +# but none exists for arm64. +options NO_SYSCTL_DESCR +nooptions WITNESS +nooptions SMP + +options RESCUE diff --git a/sys/arm64/conf/std.arm64 b/sys/arm64/conf/std.arm64 --- a/sys/arm64/conf/std.arm64 +++ b/sys/arm64/conf/std.arm64 @@ -90,6 +90,9 @@ options DEBUGNET # debugnet networking options NETDUMP # netdump(4) client support +#options RESCUE_SUPPORT +#makeoptions RESCUE_EMBED=/path/to/rescue/kernel + # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel diff --git a/sys/arm64/include/cpufunc.h b/sys/arm64/include/cpufunc.h --- a/sys/arm64/include/cpufunc.h +++ b/sys/arm64/include/cpufunc.h @@ -98,6 +98,13 @@ __asm __volatile("msr daifclr, #(" __XSTRING(DAIF_A) ")"); } +static __inline void +serror_disable(void) +{ + + __asm __volatile("msr daifset, #(" __XSTRING(DAIF_A) ")"); +} + static __inline register_t get_midr(void) { diff --git a/sys/arm64/include/rescue.h b/sys/arm64/include/rescue.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/rescue.h @@ -0,0 +1,78 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2020, 2023 Juniper Networks Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _RESCUE_H_ +#define _RESCUE_H_ + +/* + * Dump parameters passed from the panicked kernel to the rescue kernel. Some + * of these are known at compile-time, but pass them anyway to avoid surprises. + */ +struct rescue_dump_params { + vm_paddr_t dp_msgbufpa; /* message buffer physaddr */ + vm_size_t dp_msgbufsz; /* message buffer size */ + vm_paddr_t dp_vmdumppa; /* vm_page_dump physaddr */ + vm_size_t dp_vmdumpsz; /* vm_page_dump size (bytes) */ + vm_paddr_t dp_dumpavailpa; /* dump_avail[] physaddr */ + vm_paddr_t dp_kernl0pa; /* L0 page table page physaddr */ + vm_offset_t dp_kernstart; /* beginning of KVA */ + vm_offset_t dp_kernend; /* end of mapped KVA */ + vm_offset_t dp_kernmax; /* maximum KVA */ + vm_paddr_t dp_dmapbasepa; /* lowest addr mapped by direct map */ + vm_offset_t dp_dmapmin; /* beginning of direct map range */ + vm_offset_t dp_dmapmax; /* end of direct map range */ +}; + +/* + * Memory layout parameters passed to the rescue kernel. These are used to + * bootstrap the kernel and to initialize the dumper. + */ +struct rescue_kernel_params { + struct rescue_dump_params kp_dumpparams; + vm_paddr_t kp_dtbstart; + vm_size_t kp_dtblen; + vm_paddr_t kp_efimapstart; + vm_size_t kp_efimaplen; + vm_paddr_t kp_efifbaddr; + vm_paddr_t kp_kenvstart; + vm_size_t kp_kenvlen; + vm_paddr_t kp_kernstart; + int kp_boothowto; +}; + +/* + * The rescue kernel is copied at this offset into the rescue reservation. The + * offset must be a multiple of 2MB. 
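+ * (locore establishes the kernel's initial mappings with 2MB L2 block
+ * entries, hence the alignment requirement.)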
+ */ +#define RESCUE_RESERV_KERNEL_OFFSET L2_SIZE + +extern bool do_rescue_minidump; + +extern void rescue_dumper_init(struct rescue_dump_params *); +extern void rescue_kernel_exec(void); + +#endif /* !_RESCUE_H_ */ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -88,6 +88,8 @@ amd64/amd64/mpboot.S optional smp amd64/amd64/pmap.c standard amd64/amd64/ptrace_machdep.c standard +amd64/amd64/rescue_dumper.c optional rescue +amd64/amd64/rescue_machdep.c optional rescue_support amd64/amd64/support.S standard amd64/amd64/sys_machdep.c standard amd64/amd64/trap.c standard diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -70,6 +70,8 @@ compile-with "${NORMAL_C:N-mbranch-protection*} -mbranch-protection=bti" arm64/arm64/pmap.c standard arm64/arm64/ptrace_machdep.c standard +arm64/arm64/rescue_dumper.c optional rescue +arm64/arm64/rescue_machdep.c optional rescue_support arm64/arm64/sdt_machdep.c optional kdtrace_hooks arm64/arm64/sigtramp.S standard arm64/arm64/stack_machdep.c optional ddb | stack diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk --- a/sys/conf/kern.post.mk +++ b/sys/conf/kern.post.mk @@ -477,4 +477,11 @@ .endif .endif +.if ${RESCUE_EMBED:Uno} != "no" +rescue.o: ${RESCUE_EMBED} $S/dev/md/embedfs.S + ${CC} ${CFLAGS} ${ACFLAGS} -DMFS_IMAGE=\""${RESCUE_EMBED}"\" \ + -DSYM=rescue_start -DSYM_END=rescue_end -c \ + $S/dev/md/embedfs.S -o ${.TARGET} +.endif + .include "kern.mk" diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -369,6 +369,9 @@ SYSTEM_OBJS+= embedfs_${MFS_IMAGE:T:R}.o .endif .endif +.if ${RESCUE_EMBED:Uno} != "no" +SYSTEM_OBJS+= rescue.o +.endif SYSTEM_LD_BASECMD= \ ${LD} -m ${LD_EMULATION} -Bdynamic -L $S/conf -T ${LDSCRIPT} ${_LDFLAGS} \ --no-warn-mismatch --warn-common --export-dynamic \ diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64 --- a/sys/conf/options.amd64 +++ b/sys/conf/options.amd64 @@ -68,3 +68,17 @@ # x86 specific uart options UART_NS8250_EARLY_PORT opt_uart.h + +# +# Compile the kernel to be run as a rescue kernel after a panic +# and enable dumping the host kernel's memory. +# +RESCUE opt_global.h + +# +# Enable recovery of a memory dump by a rescue kernel. The rescue kernel must +# be compiled with the RESCUE option configured, and the rescue kernel image +# must be embedded by setting the RESCUE_EMBED make option to the path of a +# rescue kernel. The RESCUE and RESCUE_SUPPORT options are mutually exclusive. +# +RESCUE_SUPPORT opt_global.h diff --git a/sys/conf/options.arm64 b/sys/conf/options.arm64 --- a/sys/conf/options.arm64 +++ b/sys/conf/options.arm64 @@ -19,6 +19,20 @@ # EFI Runtime services support EFIRT opt_efirt.h +# +# Compile the kernel to be run as a rescue kernel after a panic +# and enable dumping the host kernel's memory. +# +RESCUE opt_global.h + +# +# Enable recovery of a memory dump by a rescue kernel. The rescue kernel must +# be compiled with the RESCUE option configured, and the rescue kernel image +# must be embedded by setting the RESCUE_EMBED make option to the path of a +# rescue kernel. The RESCUE and RESCUE_SUPPORT options are mutually exclusive. +# +RESCUE_SUPPORT opt_global.h + # Bhyve VMM opt_global.h diff --git a/sys/dev/md/embedfs.S b/sys/dev/md/embedfs.S --- a/sys/dev/md/embedfs.S +++ b/sys/dev/md/embedfs.S @@ -32,13 +32,25 @@ #include +/* + * XXX-MJ this is hackish. 
+ * Should we just introduce a different file for rescue kernels? + */ +#ifndef SYM +#define SYM mfs_root +#endif + +#ifndef SYM_END +#define SYM_END mfs_root_end +#endif + .section mfs, "a", %progbits - .globl mfs_root - .type mfs_root, %object -mfs_root: + .globl SYM + .type SYM, %object +SYM: .incbin MFS_IMAGE - .size mfs_root, . - mfs_root - .globl mfs_root_end - .type mfs_root_end, %object -mfs_root_end: - .size mfs_root_end, . - mfs_root_end + .size SYM, . - SYM + .globl SYM_END + .type SYM_END, %object +SYM_END: + .size SYM_END, . - SYM_END diff --git a/sys/dev/xen/bus/xen_intr.c b/sys/dev/xen/bus/xen_intr.c --- a/sys/dev/xen/bus/xen_intr.c +++ b/sys/dev/xen/bus/xen_intr.c @@ -52,6 +52,8 @@ #include #include +#include + #include #include #include diff --git a/sys/kern/subr_intr.c b/sys/kern/subr_intr.c --- a/sys/kern/subr_intr.c +++ b/sys/kern/subr_intr.c @@ -180,6 +180,13 @@ SYSCTL_UINT(_machdep, OID_AUTO, nirq, CTLFLAG_RDTUN, &intr_nirq, 0, "Number of IRQs"); +#ifdef RESCUE_SUPPORT +DPCPU_DEFINE_STATIC(struct intr_irqsrc *, isrc_active); +#define ISRC_ACTIVE_SET(ptr) DPCPU_SET(isrc_active, (ptr)) +#else +#define ISRC_ACTIVE_SET(ptr) +#endif + /* Data for MI statistics reporting. */ u_long *intrcnt; char *intrnames; @@ -414,8 +421,11 @@ } else #endif if (isrc->isrc_event != NULL) { - if (intr_event_handle(isrc->isrc_event, tf) == 0) + ISRC_ACTIVE_SET(isrc); + if (intr_event_handle(isrc->isrc_event, tf) == 0) { + ISRC_ACTIVE_SET(NULL); return (0); + } } if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0) @@ -554,6 +564,24 @@ return (error); } +#ifdef RESCUE_SUPPORT +/* + * Make sure that active interrupts are acknowledged before executing the rescue + * kernel. Otherwise it will not be possible to reconfigure the PIC. + */ +void +intr_isrc_reset(void) +{ + struct intr_irqsrc *isrc; + + MPASS(KERNEL_PANICKED()); + + isrc = DPCPU_GET(isrc_active); + if (isrc != NULL) + PIC_POST_FILTER(isrc->isrc_dev, isrc); +} +#endif + #ifdef SMP /* * A support function for a PIC to decide if provided ISRC should be inited diff --git a/sys/sys/efi.h b/sys/sys/efi.h --- a/sys/sys/efi.h +++ b/sys/sys/efi.h @@ -155,6 +155,20 @@ }; #ifdef _KERNEL +/* + * Can a memory range of this type be included in phys_avail[]? + */ +static inline bool +efi_physmem_type(uint32_t type) +{ + if (type == EFI_MD_TYPE_CODE || + type == EFI_MD_TYPE_DATA || + type == EFI_MD_TYPE_BS_CODE || + type == EFI_MD_TYPE_BS_DATA || + type == EFI_MD_TYPE_FREE) + return (true); + return (false); +} #ifdef EFIABI_ATTR struct efi_rt { diff --git a/sys/sys/interrupt.h b/sys/sys/interrupt.h --- a/sys/sys/interrupt.h +++ b/sys/sys/interrupt.h @@ -189,6 +189,9 @@ int intr_getaffinity(int irq, int mode, void *mask); void *intr_handler_source(void *cookie); int intr_setaffinity(int irq, int mode, const void *mask); +#ifdef RESCUE_SUPPORT +void intr_rescue_exec(void); +#endif void _intr_drain(int irq); /* LinuxKPI only. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, diff --git a/sys/sys/intr.h b/sys/sys/intr.h --- a/sys/sys/intr.h +++ b/sys/sys/intr.h @@ -104,6 +104,9 @@ int intr_isrc_deregister(struct intr_irqsrc *); int intr_isrc_register(struct intr_irqsrc *, device_t, u_int, const char *, ...) 
__printflike(4, 5); +#ifdef RESCUE_SUPPORT +void intr_isrc_reset(void); +#endif #ifdef SMP bool intr_isrc_init_on_cpu(struct intr_irqsrc *isrc, u_int cpu); diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h --- a/sys/x86/include/apicvar.h +++ b/sys/x86/include/apicvar.h @@ -152,6 +152,8 @@ #ifndef LOCORE +#include + #define APIC_IPI_DEST_SELF -1 #define APIC_IPI_DEST_ALL -2 #define APIC_IPI_DEST_OTHERS -3 diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c --- a/sys/x86/x86/intr_machdep.c +++ b/sys/x86/x86/intr_machdep.c @@ -245,6 +245,24 @@ return (0); } +#ifdef RESCUE_SUPPORT +void +intr_rescue_exec(void) +{ + for (int v = 0; v < num_io_irqs; v++) { + struct intsrc *is; + + is = interrupt_sources[v]; + if (is == NULL) + continue; + if (is->is_pic->pic_disable_intr != NULL) { + is->is_pic->pic_disable_source(is, PIC_EOI); + is->is_pic->pic_disable_intr(is); + } + } +} +#endif + struct intsrc * intr_lookup_source(int vector) { diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -1654,7 +1654,7 @@ * we don't lose an interrupt delivery race. */ td = curthread; - if (!rebooting) { + if (!rebooting && panicstr == NULL) { thread_lock(td); if (sched_is_bound(td)) panic("apic_free_vector: Thread already bound.\n");
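A note on consuming the new device: rescue_dumper.c above exposes the minidump through /dev/dumper as a strictly sequential stream, and dumper_read() returns ESPIPE if the read offset does not match its cursor. The sketch below is illustrative only and is not part of this change; the output path /var/crash/vmcore, the buffer size, and the program itself are hypothetical stand-ins for whatever the embedded rescue image actually runs (dd(1) works just as well).

/*
 * Hypothetical userspace consumer of /dev/dumper: copy the minidump to a
 * file using sequential reads only, since the device rejects seeks.
 */
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	static char buf[128 * 1024];	/* arbitrary, any size works */
	ssize_t n;
	int ifd, ofd;

	ifd = open("/dev/dumper", O_RDONLY);
	if (ifd < 0)
		err(1, "open(/dev/dumper)");
	ofd = open("/var/crash/vmcore", O_WRONLY | O_CREAT | O_TRUNC, 0600);
	if (ofd < 0)
		err(1, "open(/var/crash/vmcore)");

	/* Read the device strictly in order; no lseek(). */
	while ((n = read(ifd, buf, sizeof(buf))) > 0) {
		if (write(ofd, buf, (size_t)n) != n)
			err(1, "write");
	}
	if (n < 0)
		err(1, "read");
	(void)close(ofd);
	(void)close(ifd);
	return (0);
}

Any read size works, since dumper_read_seg() copes with offsets that are not page-aligned.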