diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c index de8433ae5a05..3b9c39ad58da 100644 --- a/sys/amd64/amd64/minidump_machdep.c +++ b/sys/amd64/amd64/minidump_machdep.c @@ -1,417 +1,417 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_pmap.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static size_t progress, dumpsize, wdog_next; static int dump_retry_count = 5; SYSCTL_INT(_machdep, OID_AUTO, dump_retry_count, CTLFLAG_RWTUN, &dump_retry_count, 0, "Number of times dump has to retry before bailing out"); static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); - error = dump_append(di, dump_va, 0, fragsz); + error = dump_append(di, dump_va, fragsz); fragsz = 0; return (error); } /* Pat the watchdog approximately every 128MB of the dump. 
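The mechanical change in this hunk, repeated across every file in the patch, is the removal of the always-zero physical-address argument from the dumper append path. Below is a hedged, standalone model of that change; the struct and the names dump_append_old/dump_append_new are stubs for illustration only, and the real prototypes (in sys/sys/conf.h) are merely inferred from the call sites in this diff.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stub type so the model compiles on its own; not the kernel structure. */
struct dumperinfo { size_t written; };

/* Old shape: every caller passed physical == 0, so the argument was dead. */
static int
dump_append_old(struct dumperinfo *di, void *va, uintptr_t physical, size_t len)
{
	(void)va;
	(void)physical;			/* never used */
	di->written += len;
	return (0);
}

/* New shape after this patch: the unused parameter is simply dropped. */
static int
dump_append_new(struct dumperinfo *di, void *va, size_t len)
{
	(void)va;
	di->written += len;
	return (0);
}

int
main(void)
{
	struct dumperinfo di = { 0 };
	static char buf[4096];

	dump_append_old(&di, buf, 0, sizeof(buf));	/* old call-site shape */
	dump_append_new(&di, buf, sizeof(buf));		/* new call-site shape */
	printf("%zu bytes appended\n", di.written);
	return (0);
}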
*/ #define WDOG_DUMP_INTERVAL (128 * 1024 * 1024) static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, i, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if ((((uintptr_t)pa) % PAGE_SIZE) != 0) { printf("address not page aligned %p\n", ptr); return (EINVAL); } if (ptr != NULL) { /* If we're doing a virtual dump, flush any pre-existing pa pages */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; progress -= len; dumpsys_pb_progress(len); if (progress <= wdog_next) { wdog_kern_pat(WD_LASTVAL); if (wdog_next > WDOG_DUMP_INTERVAL) wdog_next -= WDOG_DUMP_INTERVAL; else wdog_next = 0; } if (ptr) { - error = dump_append(di, ptr, 0, len); + error = dump_append(di, ptr, len); if (error) return (error); ptr += len; sz -= len; } else { for (i = 0; i < len; i += PAGE_SIZE) dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT); fragsz += len; pa += len; sz -= len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } /* A fake page table page, to avoid having to handle both 4K and 2M pages */ static pd_entry_t fakepd[NPDEPG]; int cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) { uint32_t pmapsize; vm_offset_t va, kva_end; int error; uint64_t *pml4, *pdp, *pd, *pt, pa; uint64_t pdpe, pde, pte; int ii, j, k, n; int retry_count; struct minidumphdr mdhdr; struct msgbuf *mbp; retry_count = 0; retry: retry_count++; /* Snapshot the KVA upper bound in case it grows. */ kva_end = MAX(KERNBASE + nkpt * NBPDR, kernel_vm_end); /* * Walk the kernel page table pages, setting the active entries in the * dump bitmap. * * NB: for a live dump, we may be racing with updates to the page * tables, so care must be taken to read each entry only once. */ pmapsize = 0; for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) { /* * We always write a page, even if it is zero. Each * page written corresponds to 1GB of space */ pmapsize += PAGE_SIZE; ii = pmap_pml4e_index(va); pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii; pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdpe = atomic_load_64(&pdp[pmap_pdpe_index(va)]); if ((pdpe & PG_V) == 0) { va += NBPDP; continue; } /* * 1GB page is represented as 512 2MB pages in a dump. */ if ((pdpe & PG_PS) != 0) { va += NBPDP; pa = pdpe & PG_PS_FRAME; for (n = 0; n < NPDEPG * NPTEPG; n++) { if (vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); pa += PAGE_SIZE; } continue; } pd = (uint64_t *)PHYS_TO_DMAP(pdpe & PG_FRAME); for (n = 0; n < NPDEPG; n++, va += NBPDR) { pde = atomic_load_64(&pd[pmap_pde_index(va)]); if ((pde & PG_V) == 0) continue; if ((pde & PG_PS) != 0) { /* This is an entire 2M page. 
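blk_write() above takes either a kernel virtual pointer or a physical address, never both; the physical path stages pages through a temporary mapping and flushes them in maxdumpsz chunks. A minimal userspace model of that calling convention follows; model_blk_write and the stub vm_paddr_t are illustrative only, not kernel code.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef unsigned long vm_paddr_t;	/* stub for the model */

/* Mirrors the contract checked at the top of blk_write(). */
static int
model_blk_write(char *ptr, vm_paddr_t pa, size_t sz)
{
	if (ptr != NULL && pa != 0) {
		printf("cant have both va and pa!\n");
		return (-1);
	}
	if (ptr != NULL)
		printf("append %zu bytes from a virtual buffer\n", sz);
	else
		printf("append %zu bytes from physical address %#lx\n", sz, pa);
	return (0);
}

int
main(void)
{
	static char header[4096];

	memset(header, 0, sizeof(header));
	model_blk_write(header, 0, sizeof(header));	/* headers, bitmap, msgbuf */
	model_blk_write(NULL, 0x200000UL, 4096);	/* a dumpable page, by pa */
	return (0);
}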
*/ pa = pde & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { if (vm_phys_is_dumpable(pa)) vm_page_dump_add( state->dump_bitset, pa); pa += PAGE_SIZE; } continue; } pa = pde & PG_FRAME; /* set bit for this PTE page */ if (vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); /* and for each valid page in this 2MB block */ pt = (uint64_t *)PHYS_TO_DMAP(pde & PG_FRAME); for (k = 0; k < NPTEPG; k++) { pte = atomic_load_64(&pt[k]); if ((pte & PG_V) == 0) continue; pa = pte & PG_FRAME; if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); } } } /* Calculate dump size. */ mbp = state->msgbufp; dumpsize = pmapsize; dumpsize += round_page(mbp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { /* Clear out undumpable pages now if needed */ if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) { dumpsize += PAGE_SIZE; } else { vm_page_dump_drop(state->dump_bitset, pa); } } dumpsize += PAGE_SIZE; wdog_next = progress = dumpsize; dumpsys_pb_init(dumpsize); /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.dmapbase = DMAP_MIN_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump my header */ bzero(&fakepd, sizeof(fakepd)); bcopy(&mdhdr, &fakepd, sizeof(mdhdr)); error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, mbp->msg_ptr, 0, round_page(mbp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(fakepd), "Large dump_avail not handled"); bzero(&fakepd, sizeof(fakepd)); memcpy(fakepd, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)fakepd, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)state->dump_bitset, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page directory pages */ bzero(fakepd, sizeof(fakepd)); for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) { ii = pmap_pml4e_index(va); pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii; pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdpe = atomic_load_64(&pdp[pmap_pdpe_index(va)]); /* We always write a page, even if it is zero */ if ((pdpe & PG_V) == 0) { error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakepd in the same block */ error = blk_flush(di); if (error) goto fail; continue; } /* 1GB page is represented as 512 2MB pages in a dump */ if ((pdpe & PG_PS) != 0) { /* PDPE and PDP have identical layout in this case */ fakepd[0] = pdpe; for (j = 1; j < NPDEPG; j++) fakepd[j] = fakepd[j - 1] + NBPDR; error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakepd in the same block */ error = blk_flush(di); if (error) goto fail; bzero(fakepd, sizeof(fakepd)); continue; } pa = pdpe & PG_FRAME; if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) { pd = (uint64_t 
*)PHYS_TO_DMAP(pa); error = blk_write(di, (char *)pd, 0, PAGE_SIZE); } else { /* Malformed pa, write the zeroed fakepd. */ error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE); } if (error) goto fail; error = blk_flush(di); if (error) goto fail; } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < dump_retry_count) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) { printf("Dump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } diff --git a/sys/arm/arm/minidump_machdep.c b/sys/arm/arm/minidump_machdep.c index 80692ee1359c..8a74dc9818ba 100644 --- a/sys/arm/arm/minidump_machdep.c +++ b/sys/arm/arm/minidump_machdep.c @@ -1,327 +1,327 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Peter Wemm * Copyright (c) 2008 Semihalf, Grzegorz Bernacki * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: FreeBSD: src/sys/i386/i386/minidump_machdep.c,v 1.6 2008/08/17 23:27:27 */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. 
*/ static size_t fragsz; static void *dump_va; static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); - error = dump_append(di, dump_va, 0, fragsz); + error = dump_append(di, dump_va, fragsz); fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, i, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if (pa != 0) { if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if ((pa & PAGE_MASK) != 0) { printf("address not page aligned\n"); return (EINVAL); } } if (ptr != NULL) { /* Flush any pre-existing pa pages before a virtual dump. */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; dumpsys_pb_progress(len); wdog_kern_pat(WD_LASTVAL); if (ptr) { - error = dump_append(di, ptr, 0, len); + error = dump_append(di, ptr, len); if (error) return (error); ptr += len; sz -= len; } else { for (i = 0; i < len; i += PAGE_SIZE) dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT); fragsz += len; pa += len; sz -= len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } /* A buffer for general use. Its size must be one page at least. */ static char dumpbuf[PAGE_SIZE] __aligned(sizeof(uint64_t)); CTASSERT(sizeof(dumpbuf) % sizeof(pt2_entry_t) == 0); int cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) { struct minidumphdr mdhdr; struct msgbuf *mbp; uint64_t dumpsize, *dump_avail_buf; uint32_t ptesize; uint32_t pa, prev_pa = 0, count = 0; vm_offset_t va, kva_end; int error, i; char *addr; /* * Flush caches. Note that in the SMP case this operates only on the * current CPU's L1 cache. Before we reach this point, code in either * the system shutdown or kernel debugger has called stop_cpus() to stop * all cores other than this one. Part of the ARM handling of * stop_cpus() is to call wbinv_all() on that core's local L1 cache. So * by time we get to here, all that remains is to flush the L1 for the * current CPU, then the L2. */ dcache_wbinv_poc_all(); /* Snapshot the KVA upper bound in case it grows. */ kva_end = kernel_vm_end; /* * Walk the kernel page table pages, setting the active entries in the * dump bitmap. */ ptesize = 0; for (va = KERNBASE; va < kva_end; va += PAGE_SIZE) { pa = pmap_dump_kextract(va, NULL); if (pa != 0 && vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); ptesize += sizeof(pt2_entry_t); } /* Calculate dump size. 
*/ mbp = state->msgbufp; dumpsize = ptesize; dumpsize += round_page(mbp->msg_size); dumpsize += round_page(nitems(dump_avail) * sizeof(uint64_t)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { /* Clear out undumpable pages now if needed */ if (vm_phys_is_dumpable(pa)) dumpsize += PAGE_SIZE; else vm_page_dump_drop(state->dump_bitset, pa); } dumpsize += PAGE_SIZE; dumpsys_pb_init(dumpsize); /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.ptesize = ptesize; mdhdr.kernbase = KERNBASE; mdhdr.arch = __ARM_ARCH; mdhdr.mmuformat = MINIDUMP_MMU_FORMAT_V6; mdhdr.dumpavailsize = round_page(nitems(dump_avail) * sizeof(uint64_t)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_ARM_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Physical memory: %u MB\n", ptoa((uintmax_t)physmem) / 1048576); printf("Dumping %llu MB:", (long long)dumpsize >> 20); /* Dump my header */ bzero(dumpbuf, sizeof(dumpbuf)); bcopy(&mdhdr, dumpbuf, sizeof(mdhdr)); error = blk_write(di, dumpbuf, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, mbp->msg_ptr, 0, round_page(mbp->msg_size)); if (error) goto fail; /* Dump dump_avail. Make a copy using 64-bit physical addresses. */ _Static_assert(nitems(dump_avail) * sizeof(uint64_t) <= sizeof(dumpbuf), "Large dump_avail not handled"); bzero(dumpbuf, sizeof(dumpbuf)); dump_avail_buf = (uint64_t *)dumpbuf; for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) { dump_avail_buf[i] = dump_avail[i]; dump_avail_buf[i + 1] = dump_avail[i + 1]; } error = blk_write(di, dumpbuf, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)state->dump_bitset, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page table pages */ addr = dumpbuf; for (va = KERNBASE; va < kva_end; va += PAGE_SIZE) { pmap_dump_kextract(va, (pt2_entry_t *)addr); addr += sizeof(pt2_entry_t); if (addr == dumpbuf + sizeof(dumpbuf)) { error = blk_write(di, dumpbuf, 0, sizeof(dumpbuf)); if (error != 0) goto fail; addr = dumpbuf; } } if (addr != dumpbuf) { error = blk_write(di, dumpbuf, 0, addr - dumpbuf); if (error != 0) goto fail; } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { if (!count) { prev_pa = pa; count++; } else { if (pa == (prev_pa + count * PAGE_SIZE)) count++; else { error = blk_write(di, NULL, prev_pa, count * PAGE_SIZE); if (error) goto fail; count = 1; prev_pa = pa; } } } if (count) { error = blk_write(di, NULL, prev_pa, count * PAGE_SIZE); if (error) goto fail; count = 0; prev_pa = 0; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) { printf("\nDump failed. 
Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } diff --git a/sys/arm64/arm64/minidump_machdep.c b/sys/arm64/arm64/minidump_machdep.c index e05a19fc1c41..3dfeb3dfef1e 100644 --- a/sys/arm64/arm64/minidump_machdep.c +++ b/sys/arm64/arm64/minidump_machdep.c @@ -1,395 +1,395 @@ /*- * Copyright (c) 2006 Peter Wemm * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * * This software was developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static size_t dumpsize; static uint64_t tmpbuffer[Ln_ENTRIES]; static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); - error = dump_append(di, dump_va, 0, fragsz); + error = dump_append(di, dump_va, fragsz); fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if ((((uintptr_t)pa) % PAGE_SIZE) != 0) { printf("address not page aligned %p\n", ptr); return (EINVAL); } if (ptr != NULL) { /* * If we're doing a virtual dump, flush any * pre-existing pa pages. 
*/ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; dumpsys_pb_progress(len); wdog_kern_pat(WD_LASTVAL); if (ptr) { - error = dump_append(di, ptr, 0, len); + error = dump_append(di, ptr, len); if (error) return (error); ptr += len; sz -= len; } else { dump_va = (void *)PHYS_TO_DMAP(pa); fragsz += len; pa += len; sz -= len; error = blk_flush(di); if (error) return (error); } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } int cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) { struct minidumphdr mdhdr; struct msgbuf *mbp; pd_entry_t *l0, *l1, l1e, *l2, l2e; pt_entry_t *l3, l3e; vm_offset_t va, kva_end; vm_paddr_t pa; uint32_t pmapsize; int error, i, j, retry_count; retry_count = 0; retry: retry_count++; error = 0; pmapsize = 0; /* Snapshot the KVA upper bound in case it grows. */ kva_end = kernel_vm_end; /* * Walk the kernel page table pages, setting the active entries in the * dump bitmap. * * NB: for a live dump, we may be racing with updates to the page * tables, so care must be taken to read each entry only once. */ for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += L2_SIZE) { pmapsize += PAGE_SIZE; if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3)) continue; l1e = atomic_load_64(l1); l2e = atomic_load_64(l2); if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { pa = l1e & ~ATTR_MASK; for (i = 0; i < Ln_ENTRIES * Ln_ENTRIES; i++, pa += PAGE_SIZE) if (vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); pmapsize += (Ln_ENTRIES - 1) * PAGE_SIZE; va += L1_SIZE - L2_SIZE; } else if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { pa = l2e & ~ATTR_MASK; for (i = 0; i < Ln_ENTRIES; i++, pa += PAGE_SIZE) { if (vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); } } else if ((l2e & ATTR_DESCR_MASK) == L2_TABLE) { for (i = 0; i < Ln_ENTRIES; i++) { l3e = atomic_load_64(&l3[i]); if ((l3e & ATTR_DESCR_MASK) != L3_PAGE) continue; pa = l3e & ~ATTR_MASK; if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); } } } /* Calculate dump size. 
*/ mbp = state->msgbufp; dumpsize = pmapsize; dumpsize += round_page(mbp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) dumpsize += PAGE_SIZE; else vm_page_dump_drop(state->dump_bitset, pa); } dumpsize += PAGE_SIZE; dumpsys_pb_init(dumpsize); /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.dmapphys = DMAP_MIN_PHYSADDR; mdhdr.dmapbase = DMAP_MIN_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); mdhdr.flags = MINIDUMP_FLAG_PS_4K; dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AARCH64_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump my header */ bzero(&tmpbuffer, sizeof(tmpbuffer)); bcopy(&mdhdr, &tmpbuffer, sizeof(mdhdr)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, mbp->msg_ptr, 0, round_page(mbp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(tmpbuffer), "Large dump_avail not handled"); bzero(tmpbuffer, sizeof(tmpbuffer)); memcpy(tmpbuffer, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)state->dump_bitset, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page directory pages */ bzero(&tmpbuffer, sizeof(tmpbuffer)); for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += L2_SIZE) { if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3)) { /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse tmpbuffer in the same block*/ error = blk_flush(di); if (error) goto fail; continue; } l1e = atomic_load_64(l1); l2e = atomic_load_64(l2); if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { /* * Handle a 1GB block mapping: write out 512 fake L2 * pages. */ pa = (l1e & ~ATTR_MASK) | (va & L1_OFFSET); for (i = 0; i < Ln_ENTRIES; i++) { for (j = 0; j < Ln_ENTRIES; j++) { tmpbuffer[j] = (pa + i * L2_SIZE + j * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; } error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; } /* flush, in case we reuse tmpbuffer in the same block*/ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); va += L1_SIZE - L2_SIZE; } else if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { pa = (l2e & ~ATTR_MASK) | (va & L2_OFFSET); /* Generate fake l3 entries based upon the l1 entry */ for (i = 0; i < Ln_ENTRIES; i++) { tmpbuffer[i] = (pa + i * PAGE_SIZE) | ATTR_DEFAULT | L3_PAGE; } error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakepd in the same block */ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); continue; } else { pa = l2e & ~ATTR_MASK; /* * We always write a page, even if it is zero. If pa * is malformed, write the zeroed tmpbuffer. 
*/ if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) error = blk_write(di, NULL, pa, PAGE_SIZE); else error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; } } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < 5) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) { printf("Dump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } diff --git a/sys/ddb/db_textdump.c b/sys/ddb/db_textdump.c index 539b51e58e1c..aad8b6965318 100644 --- a/sys/ddb/db_textdump.c +++ b/sys/ddb/db_textdump.c @@ -1,564 +1,564 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Kernel text-dump support: write a series of text files to the dump * partition for later recovery, including captured DDB output, kernel * configuration, message buffer, and panic message. This allows for a more * compact representation of critical debugging information than traditional * binary dumps, as well as allowing dump information to be used without * access to kernel symbols, source code, etc. * * Storage Layout * -------------- * * Crash dumps are aligned to the end of the dump or swap partition in order * to minimize the chances of swap duing fsck eating into the dump. However, * unlike a memory dump, we don't know the size of the textdump a priori, so * can't just write it out sequentially in order from a known starting point * calculated with respect to the end of the partition. In order to address * this, we actually write out the textdump in reverse block order, allowing * us to directly align it to the end of the partition and then write out the * dump header and trailer before and after it once done. 
savecore(8) must * know to reverse the order of the blocks in order to produce a readable * file. * * Data is written out in the ustar file format so that we can write data * incrementally as a stream without reference to previous files. * * TODO * ---- * * - Allow subsystems to register to submit files for inclusion in the text * dump in a generic way. */ #include __FBSDID("$FreeBSD$"); #include "opt_config.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_debug_ddb, OID_AUTO, textdump, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "DDB textdump options"); /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This is * to protect us from metadata and metadata from us. */ #define SIZEOF_METADATA (64*1024) /* * Data is written out as a series of files in the ustar tar format. ustar * is a simple streamed format consiting of a series of files prefixed with * headers, and all padded to 512-byte block boundaries, which maps * conveniently to our requirements. */ struct ustar_header { char uh_filename[100]; char uh_mode[8]; char uh_tar_owner[8]; char uh_tar_group[8]; char uh_size[12]; char uh_mtime[12]; char uh_sum[8]; char uh_type; char uh_linkfile[100]; char uh_ustar[6]; char uh_version[2]; char uh_owner[32]; char uh_group[32]; char uh_major[8]; char uh_minor[8]; char uh_filenameprefix[155]; char uh_zeropad[12]; } __packed; /* * Various size assertions -- pretty much everything must be one block in * size. */ CTASSERT(sizeof(struct kerneldumpheader) == TEXTDUMP_BLOCKSIZE); CTASSERT(sizeof(struct ustar_header) == TEXTDUMP_BLOCKSIZE); /* * Is a textdump scheduled? If so, the shutdown code will invoke our dumpsys * routine instead of the machine-dependent kernel dump routine. */ #ifdef TEXTDUMP_PREFERRED int textdump_pending = 1; #else int textdump_pending = 0; #endif SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, pending, CTLFLAG_RW, &textdump_pending, 0, "Perform textdump instead of regular kernel dump."); /* * Various constants for tar headers and contents. */ #define TAR_USER "root" #define TAR_GROUP "wheel" #define TAR_UID "0" #define TAR_GID "0" #define TAR_MODE "0600" #define TAR_USTAR "ustar" #define TAR_CONFIG_FILENAME "config.txt" /* Kernel configuration. */ #define TAR_MSGBUF_FILENAME "msgbuf.txt" /* Kernel messsage buffer. */ #define TAR_PANIC_FILENAME "panic.txt" /* Panic message. */ #define TAR_VERSION_FILENAME "version.txt" /* Kernel version. */ /* * Configure which files will be dumped. */ #ifdef INCLUDE_CONFIG_FILE static int textdump_do_config = 1; SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_config, CTLFLAG_RW, &textdump_do_config, 0, "Dump kernel configuration in textdump"); #endif static int textdump_do_ddb = 1; SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_ddb, CTLFLAG_RW, &textdump_do_ddb, 0, "Dump DDB captured output in textdump"); static int textdump_do_msgbuf = 1; SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_msgbuf, CTLFLAG_RW, &textdump_do_msgbuf, 0, "Dump kernel message buffer in textdump"); static int textdump_do_panic = 1; SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_panic, CTLFLAG_RW, &textdump_do_panic, 0, "Dump kernel panic message in textdump"); static int textdump_do_version = 1; SYSCTL_INT(_debug_ddb_textdump, OID_AUTO, do_version, CTLFLAG_RW, &textdump_do_version, 0, "Dump kernel version string in textdump"); /* * State related to incremental writing of blocks to disk. */ static off_t textdump_offset; /* Offset of next sequential write. 
*/ static int textdump_error; /* Carried write error, if any. */ /* * Statically allocate space to prepare block-sized headers and data. */ char textdump_block_buffer[TEXTDUMP_BLOCKSIZE]; static struct kerneldumpheader kdh; /* * Calculate and fill in the checksum for a ustar header. */ static void ustar_checksum(struct ustar_header *uhp) { u_int sum; int i; for (i = 0; i < sizeof(uhp->uh_sum); i++) uhp->uh_sum[i] = ' '; sum = 0; for (i = 0; i < sizeof(*uhp); i++) sum += ((u_char *)uhp)[i]; snprintf(uhp->uh_sum, sizeof(uhp->uh_sum), "%6o", sum); } /* * Each file in the tarball has a block-sized header with its name and other, * largely hard-coded, properties. */ void textdump_mkustar(char *block_buffer, const char *filename, u_int size) { struct ustar_header *uhp; #ifdef TEXTDUMP_VERBOSE if (textdump_error == 0) printf("textdump: creating '%s'.\n", filename); #endif uhp = (struct ustar_header *)block_buffer; bzero(uhp, sizeof(*uhp)); strlcpy(uhp->uh_filename, filename, sizeof(uhp->uh_filename)); strlcpy(uhp->uh_mode, TAR_MODE, sizeof(uhp->uh_mode)); snprintf(uhp->uh_size, sizeof(uhp->uh_size), "%o", size); strlcpy(uhp->uh_tar_owner, TAR_UID, sizeof(uhp->uh_tar_owner)); strlcpy(uhp->uh_tar_group, TAR_GID, sizeof(uhp->uh_tar_group)); strlcpy(uhp->uh_owner, TAR_USER, sizeof(uhp->uh_owner)); strlcpy(uhp->uh_group, TAR_GROUP, sizeof(uhp->uh_group)); snprintf(uhp->uh_mtime, sizeof(uhp->uh_mtime), "%lo", (unsigned long)time_second); uhp->uh_type = 0; strlcpy(uhp->uh_ustar, TAR_USTAR, sizeof(uhp->uh_ustar)); ustar_checksum(uhp); } /* * textdump_writeblock() writes TEXTDUMP_BLOCKSIZE-sized blocks of data to * the space between di->mediaoffset and di->mediaoffset + di->mediasize. It * accepts an offset relative to di->mediaoffset. If we're carrying any * error from previous I/O, return that error and don't continue to try to * write. Most writers ignore the error and forge ahead on the basis that * there's not much you can do. */ static int textdump_writeblock(struct dumperinfo *di, off_t offset, char *buffer) { if (textdump_error) return (textdump_error); if (offset + TEXTDUMP_BLOCKSIZE > di->mediasize) return (EIO); if (offset < SIZEOF_METADATA) return (ENOSPC); - textdump_error = dump_write(di, buffer, 0, offset + di->mediaoffset, + textdump_error = dump_write(di, buffer, offset + di->mediaoffset, TEXTDUMP_BLOCKSIZE); if (textdump_error) printf("textdump_writeblock: offset %jd, error %d\n", (intmax_t)offset, textdump_error); return (textdump_error); } /* * Interfaces to save and restore the dump offset, so that printers can go * back to rewrite a header if required, while avoiding their knowing about * the global layout of the blocks. * * If we ever want to support writing textdumps to tape or other * stream-oriented target, we'll need to remove this. */ void textdump_saveoff(off_t *offsetp) { *offsetp = textdump_offset; } void textdump_restoreoff(off_t offset) { textdump_offset = offset; } /* * Interface to write the "next block" relative to the current offset; since * we write backwards from the end of the partition, we subtract, but there's * no reason for the caller to know this. */ int textdump_writenextblock(struct dumperinfo *di, char *buffer) { int error; error = textdump_writeblock(di, textdump_offset, buffer); textdump_offset -= TEXTDUMP_BLOCKSIZE; return (error); } #ifdef INCLUDE_CONFIG_FILE extern char kernconfstring[]; /* * Dump kernel configuration. 
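ustar_checksum() above follows the standard tar checksum rule: the 8-byte checksum field is treated as spaces while every byte of the 512-byte header is summed, and the result is stored as octal text. A small standalone illustration of that rule; the field offsets used here are the generic ustar ones, not values taken from this file.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned char header[512];
	unsigned int sum;
	size_t i;

	memset(header, 0, sizeof(header));
	strcpy((char *)header, "msgbuf.txt");	/* name field starts at offset 0 */
	memset(header + 148, ' ', 8);		/* checksum field: offset 148, 8 bytes */
	sum = 0;
	for (i = 0; i < sizeof(header); i++)
		sum += header[i];
	snprintf((char *)header + 148, 8, "%6o", sum);	/* same "%6o" as ustar_checksum() */
	printf("ustar checksum: %6o\n", sum);
	return (0);
}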
*/ static void textdump_dump_config(struct dumperinfo *di) { u_int count, fullblocks, len; len = strlen(kernconfstring); textdump_mkustar(textdump_block_buffer, TAR_CONFIG_FILENAME, len); (void)textdump_writenextblock(di, textdump_block_buffer); /* * Write out all full blocks directly from the string, and handle any * left-over bits by copying it to out to the local buffer and * zero-padding it. */ fullblocks = len / TEXTDUMP_BLOCKSIZE; for (count = 0; count < fullblocks; count++) (void)textdump_writenextblock(di, kernconfstring + count * TEXTDUMP_BLOCKSIZE); if (len % TEXTDUMP_BLOCKSIZE != 0) { bzero(textdump_block_buffer, TEXTDUMP_BLOCKSIZE); bcopy(kernconfstring + count * TEXTDUMP_BLOCKSIZE, textdump_block_buffer, len % TEXTDUMP_BLOCKSIZE); (void)textdump_writenextblock(di, textdump_block_buffer); } } #endif /* INCLUDE_CONFIG_FILE */ /* * Dump kernel message buffer. */ static void textdump_dump_msgbuf(struct dumperinfo *di) { off_t end_offset, tarhdr_offset; u_int i, len, offset, seq, total_len; char buf[16]; /* * Write out a dummy tar header to advance the offset; we'll rewrite * it later once we know the true size. */ textdump_saveoff(&tarhdr_offset); textdump_mkustar(textdump_block_buffer, TAR_MSGBUF_FILENAME, 0); (void)textdump_writenextblock(di, textdump_block_buffer); /* * Copy out the data in small chunks, but don't copy nuls that may be * present if the message buffer has not yet completely filled at * least once. */ total_len = 0; offset = 0; msgbuf_peekbytes(msgbufp, NULL, 0, &seq); while ((len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq)) > 0) { for (i = 0; i < len; i++) { if (buf[i] == '\0') continue; textdump_block_buffer[offset] = buf[i]; offset++; if (offset != sizeof(textdump_block_buffer)) continue; (void)textdump_writenextblock(di, textdump_block_buffer); total_len += offset; offset = 0; } } total_len += offset; /* Without the zero-padding. */ if (offset != 0) { bzero(textdump_block_buffer + offset, sizeof(textdump_block_buffer) - offset); (void)textdump_writenextblock(di, textdump_block_buffer); } /* * Rewrite tar header to reflect how much was actually written. */ textdump_saveoff(&end_offset); textdump_restoreoff(tarhdr_offset); textdump_mkustar(textdump_block_buffer, TAR_MSGBUF_FILENAME, total_len); (void)textdump_writenextblock(di, textdump_block_buffer); textdump_restoreoff(end_offset); } static void textdump_dump_panic(struct dumperinfo *di) { u_int len; /* * Write out tar header -- we store up to one block of panic message. */ len = min(strlen(panicstr), TEXTDUMP_BLOCKSIZE); textdump_mkustar(textdump_block_buffer, TAR_PANIC_FILENAME, len); (void)textdump_writenextblock(di, textdump_block_buffer); /* * Zero-pad the panic string and write out block. */ bzero(textdump_block_buffer, sizeof(textdump_block_buffer)); bcopy(panicstr, textdump_block_buffer, len); (void)textdump_writenextblock(di, textdump_block_buffer); } static void textdump_dump_version(struct dumperinfo *di) { u_int len; /* * Write out tar header -- at most one block of version information. */ len = min(strlen(version), TEXTDUMP_BLOCKSIZE); textdump_mkustar(textdump_block_buffer, TAR_VERSION_FILENAME, len); (void)textdump_writenextblock(di, textdump_block_buffer); /* * Zero pad the version string and write out block. */ bzero(textdump_block_buffer, sizeof(textdump_block_buffer)); bcopy(version, textdump_block_buffer, len); (void)textdump_writenextblock(di, textdump_block_buffer); } /* * Commit text dump to disk. 
*/ void textdump_dumpsys(struct dumperinfo *di) { struct kerneldumpcrypto *kdc; off_t dumplen, trailer_offset; if (di->blocksize != TEXTDUMP_BLOCKSIZE) { printf("Dump partition block size (%ju) not textdump " "block size (%ju)", (uintmax_t)di->blocksize, (uintmax_t)TEXTDUMP_BLOCKSIZE); return; } /* * We don't know a priori how large the dump will be, but we do know * that we need to reserve space for metadata and that we need two * dump headers. Also leave room for one ustar header and one block * of data. */ if (di->mediasize < SIZEOF_METADATA + 2 * sizeof(kdh)) { printf("Insufficient space on dump partition for minimal textdump.\n"); return; } textdump_error = 0; /* * Disable EKCD because we don't provide encrypted textdumps. */ kdc = di->kdcrypto; di->kdcrypto = NULL; /* * Position the start of the dump so that we'll write the kernel dump * trailer immediately before the end of the partition, and then work * our way back. We will rewrite this header later to reflect the * true size if things go well. */ textdump_offset = di->mediasize - sizeof(kdh); textdump_saveoff(&trailer_offset); dump_init_header(di, &kdh, TEXTDUMPMAGIC, KERNELDUMP_TEXT_VERSION, 0); (void)textdump_writenextblock(di, (char *)&kdh); /* * Write a series of files in ustar format. */ if (textdump_do_ddb) db_capture_dump(di); #ifdef INCLUDE_CONFIG_FILE if (textdump_do_config) textdump_dump_config(di); #endif if (textdump_do_msgbuf) textdump_dump_msgbuf(di); if (textdump_do_panic && KERNEL_PANICKED()) textdump_dump_panic(di); if (textdump_do_version) textdump_dump_version(di); /* * Now that we know the true size, we can write out the header, then * seek back to the end and rewrite the trailer with the correct * size. */ dumplen = trailer_offset - (textdump_offset + TEXTDUMP_BLOCKSIZE); dump_init_header(di, &kdh, TEXTDUMPMAGIC, KERNELDUMP_TEXT_VERSION, dumplen); (void)textdump_writenextblock(di, (char *)&kdh); textdump_restoreoff(trailer_offset); (void)textdump_writenextblock(di, (char *)&kdh); /* * Terminate the dump, report any errors, and clear the pending flag. */ if (textdump_error == 0) - (void)dump_write(di, NULL, 0, 0, 0); + (void)dump_write(di, NULL, 0, 0); if (textdump_error == ENOSPC) printf("Textdump: Insufficient space on dump partition\n"); else if (textdump_error != 0) printf("Textdump: Error %d writing dump\n", textdump_error); else printf("Textdump complete.\n"); textdump_pending = 0; /* * Restore EKCD status. 
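Because textdump_dumpsys() writes the trailer first at the end of the partition and then walks backwards one block per textdump_writenextblock() call, the final length is recovered as trailer_offset - (textdump_offset + TEXTDUMP_BLOCKSIZE). A tiny standalone model of that arithmetic; the partition size and block count are made up for illustration.

#include <stdio.h>

#define BLOCKSIZE	512		/* stands in for TEXTDUMP_BLOCKSIZE */

int
main(void)
{
	long long mediasize = 1024 * 1024;	/* example partition size */
	long long offset, trailer_offset;
	int i;

	offset = mediasize - BLOCKSIZE;		/* trailer block goes last on disk */
	trailer_offset = offset;
	for (i = 0; i < 4; i++) {		/* trailer placeholder + 3 data blocks */
		printf("block %d written at offset %lld\n", i, offset);
		offset -= BLOCKSIZE;		/* what textdump_writenextblock() does */
	}
	/* Counts only the ustar data blocks between the two headers: 3 * 512. */
	printf("dumplen = %lld\n", trailer_offset - (offset + BLOCKSIZE));
	return (0);
}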
*/ di->kdcrypto = kdc; } /*- * DDB(4) command to manage textdumps: * * textdump set - request a textdump * textdump status - print DDB output textdump status * textdump unset - clear textdump request */ static void db_textdump_usage(void) { db_printf("textdump [unset|set|status|dump]\n"); } void db_textdump_cmd(db_expr_t addr, bool have_addr, db_expr_t count, char *modif) { int t; t = db_read_token(); if (t != tIDENT) { db_textdump_usage(); return; } if (db_read_token() != tEOL) { db_textdump_usage(); return; } if (strcmp(db_tok_string, "set") == 0) { textdump_pending = 1; db_printf("textdump set\n"); } else if (strcmp(db_tok_string, "status") == 0) { if (textdump_pending) db_printf("textdump is set\n"); else db_printf("textdump is not set\n"); } else if (strcmp(db_tok_string, "unset") == 0) { textdump_pending = 0; db_printf("textdump unset\n"); } else if (strcmp(db_tok_string, "dump") == 0) { textdump_pending = 1; doadump(true); } else { db_textdump_usage(); } } diff --git a/sys/geom/raid/g_raid.c b/sys/geom/raid/g_raid.c index 561ac5b45e42..ffcc880d74d4 100644 --- a/sys/geom/raid/g_raid.c +++ b/sys/geom/raid/g_raid.c @@ -1,2574 +1,2572 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_raid_md_if.h" #include "g_raid_tr_if.h" static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_RAID stuff"); int g_raid_enable = 1; SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN, &g_raid_enable, 0, "Enable on-disk metadata taste"); u_int g_raid_aggressive_spare = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN, &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); u_int g_raid_debug = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0, "Debug level"); int g_raid_read_err_thresh = 10; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN, &g_raid_read_err_thresh, 0, "Number of read errors equated to disk failure"); u_int g_raid_start_timeout = 30; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN, &g_raid_start_timeout, 0, "Time to wait for all array components"); static u_int g_raid_clean_time = 5; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN, &g_raid_clean_time, 0, "Mark volume as clean when idling"); static u_int g_raid_disconnect_on_failure = 1; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN, &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); static u_int g_raid_name_format = 0; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN, &g_raid_name_format, 0, "Providers name format."); static u_int g_raid_idle_threshold = 1000000; SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN, &g_raid_idle_threshold, 1000000, "Time in microseconds to consider a volume idle."); #define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ } while (0) LIST_HEAD(, g_raid_md_class) g_raid_md_classes = LIST_HEAD_INITIALIZER(g_raid_md_classes); LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = LIST_HEAD_INITIALIZER(g_raid_tr_classes); LIST_HEAD(, g_raid_volume) g_raid_volumes = LIST_HEAD_INITIALIZER(g_raid_volumes); static eventhandler_tag g_raid_post_sync = NULL; static int g_raid_started = 0; static int g_raid_shutdown = 0; static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp); static g_taste_t g_raid_taste; static void g_raid_init(struct g_class *mp); static void g_raid_fini(struct g_class *mp); struct g_class g_raid_class = { .name = G_RAID_CLASS_NAME, .version = G_VERSION, .ctlreq = g_raid_ctl, .taste = g_raid_taste, .destroy_geom = g_raid_destroy_geom, .init = g_raid_init, .fini = g_raid_fini }; static void g_raid_destroy_provider(struct g_raid_volume *vol); static int g_raid_update_disk(struct g_raid_disk *disk, u_int event); static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); static int g_raid_update_node(struct g_raid_softc *sc, u_int event); static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); static void g_raid_start(struct bio *bp); static void 
g_raid_start_request(struct bio *bp); static void g_raid_disk_done(struct bio *bp); static void g_raid_poll(struct g_raid_softc *sc); static const char * g_raid_node_event2str(int event) { switch (event) { case G_RAID_NODE_E_WAKE: return ("WAKE"); case G_RAID_NODE_E_START: return ("START"); default: return ("INVALID"); } } const char * g_raid_disk_state2str(int state) { switch (state) { case G_RAID_DISK_S_NONE: return ("NONE"); case G_RAID_DISK_S_OFFLINE: return ("OFFLINE"); case G_RAID_DISK_S_DISABLED: return ("DISABLED"); case G_RAID_DISK_S_FAILED: return ("FAILED"); case G_RAID_DISK_S_STALE_FAILED: return ("STALE_FAILED"); case G_RAID_DISK_S_SPARE: return ("SPARE"); case G_RAID_DISK_S_STALE: return ("STALE"); case G_RAID_DISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_disk_event2str(int event) { switch (event) { case G_RAID_DISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_subdisk_state2str(int state) { switch (state) { case G_RAID_SUBDISK_S_NONE: return ("NONE"); case G_RAID_SUBDISK_S_FAILED: return ("FAILED"); case G_RAID_SUBDISK_S_NEW: return ("NEW"); case G_RAID_SUBDISK_S_REBUILD: return ("REBUILD"); case G_RAID_SUBDISK_S_UNINITIALIZED: return ("UNINITIALIZED"); case G_RAID_SUBDISK_S_STALE: return ("STALE"); case G_RAID_SUBDISK_S_RESYNC: return ("RESYNC"); case G_RAID_SUBDISK_S_ACTIVE: return ("ACTIVE"); default: return ("INVALID"); } } static const char * g_raid_subdisk_event2str(int event) { switch (event) { case G_RAID_SUBDISK_E_NEW: return ("NEW"); case G_RAID_SUBDISK_E_FAILED: return ("FAILED"); case G_RAID_SUBDISK_E_DISCONNECTED: return ("DISCONNECTED"); default: return ("INVALID"); } } const char * g_raid_volume_state2str(int state) { switch (state) { case G_RAID_VOLUME_S_STARTING: return ("STARTING"); case G_RAID_VOLUME_S_BROKEN: return ("BROKEN"); case G_RAID_VOLUME_S_DEGRADED: return ("DEGRADED"); case G_RAID_VOLUME_S_SUBOPTIMAL: return ("SUBOPTIMAL"); case G_RAID_VOLUME_S_OPTIMAL: return ("OPTIMAL"); case G_RAID_VOLUME_S_UNSUPPORTED: return ("UNSUPPORTED"); case G_RAID_VOLUME_S_STOPPED: return ("STOPPED"); default: return ("INVALID"); } } static const char * g_raid_volume_event2str(int event) { switch (event) { case G_RAID_VOLUME_E_UP: return ("UP"); case G_RAID_VOLUME_E_DOWN: return ("DOWN"); case G_RAID_VOLUME_E_START: return ("START"); case G_RAID_VOLUME_E_STARTMD: return ("STARTMD"); default: return ("INVALID"); } } const char * g_raid_volume_level2str(int level, int qual) { switch (level) { case G_RAID_VOLUME_RL_RAID0: return ("RAID0"); case G_RAID_VOLUME_RL_RAID1: return ("RAID1"); case G_RAID_VOLUME_RL_RAID3: if (qual == G_RAID_VOLUME_RLQ_R3P0) return ("RAID3-P0"); if (qual == G_RAID_VOLUME_RLQ_R3PN) return ("RAID3-PN"); return ("RAID3"); case G_RAID_VOLUME_RL_RAID4: if (qual == G_RAID_VOLUME_RLQ_R4P0) return ("RAID4-P0"); if (qual == G_RAID_VOLUME_RLQ_R4PN) return ("RAID4-PN"); return ("RAID4"); case G_RAID_VOLUME_RL_RAID5: if (qual == G_RAID_VOLUME_RLQ_R5RA) return ("RAID5-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RS) return ("RAID5-RS"); if (qual == G_RAID_VOLUME_RLQ_R5LA) return ("RAID5-LA"); if (qual == G_RAID_VOLUME_RLQ_R5LS) return ("RAID5-LS"); return ("RAID5"); case G_RAID_VOLUME_RL_RAID6: if (qual == G_RAID_VOLUME_RLQ_R6RA) return ("RAID6-RA"); if (qual == G_RAID_VOLUME_RLQ_R6RS) return ("RAID6-RS"); if (qual == G_RAID_VOLUME_RLQ_R6LA) return ("RAID6-LA"); if (qual == G_RAID_VOLUME_RLQ_R6LS) return ("RAID6-LS"); return ("RAID6"); case G_RAID_VOLUME_RL_RAIDMDF: if 
(qual == G_RAID_VOLUME_RLQ_RMDFRA) return ("RAIDMDF-RA"); if (qual == G_RAID_VOLUME_RLQ_RMDFRS) return ("RAIDMDF-RS"); if (qual == G_RAID_VOLUME_RLQ_RMDFLA) return ("RAIDMDF-LA"); if (qual == G_RAID_VOLUME_RLQ_RMDFLS) return ("RAIDMDF-LS"); return ("RAIDMDF"); case G_RAID_VOLUME_RL_RAID1E: if (qual == G_RAID_VOLUME_RLQ_R1EA) return ("RAID1E-A"); if (qual == G_RAID_VOLUME_RLQ_R1EO) return ("RAID1E-O"); return ("RAID1E"); case G_RAID_VOLUME_RL_SINGLE: return ("SINGLE"); case G_RAID_VOLUME_RL_CONCAT: return ("CONCAT"); case G_RAID_VOLUME_RL_RAID5E: if (qual == G_RAID_VOLUME_RLQ_R5ERA) return ("RAID5E-RA"); if (qual == G_RAID_VOLUME_RLQ_R5ERS) return ("RAID5E-RS"); if (qual == G_RAID_VOLUME_RLQ_R5ELA) return ("RAID5E-LA"); if (qual == G_RAID_VOLUME_RLQ_R5ELS) return ("RAID5E-LS"); return ("RAID5E"); case G_RAID_VOLUME_RL_RAID5EE: if (qual == G_RAID_VOLUME_RLQ_R5EERA) return ("RAID5EE-RA"); if (qual == G_RAID_VOLUME_RLQ_R5EERS) return ("RAID5EE-RS"); if (qual == G_RAID_VOLUME_RLQ_R5EELA) return ("RAID5EE-LA"); if (qual == G_RAID_VOLUME_RLQ_R5EELS) return ("RAID5EE-LS"); return ("RAID5EE"); case G_RAID_VOLUME_RL_RAID5R: if (qual == G_RAID_VOLUME_RLQ_R5RRA) return ("RAID5R-RA"); if (qual == G_RAID_VOLUME_RLQ_R5RRS) return ("RAID5R-RS"); if (qual == G_RAID_VOLUME_RLQ_R5RLA) return ("RAID5R-LA"); if (qual == G_RAID_VOLUME_RLQ_R5RLS) return ("RAID5R-LS"); return ("RAID5E"); default: return ("UNKNOWN"); } } int g_raid_volume_str2level(const char *str, int *level, int *qual) { *level = G_RAID_VOLUME_RL_UNKNOWN; *qual = G_RAID_VOLUME_RLQ_NONE; if (strcasecmp(str, "RAID0") == 0) *level = G_RAID_VOLUME_RL_RAID0; else if (strcasecmp(str, "RAID1") == 0) *level = G_RAID_VOLUME_RL_RAID1; else if (strcasecmp(str, "RAID3-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3P0; } else if (strcasecmp(str, "RAID3-PN") == 0 || strcasecmp(str, "RAID3") == 0) { *level = G_RAID_VOLUME_RL_RAID3; *qual = G_RAID_VOLUME_RLQ_R3PN; } else if (strcasecmp(str, "RAID4-P0") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4P0; } else if (strcasecmp(str, "RAID4-PN") == 0 || strcasecmp(str, "RAID4") == 0) { *level = G_RAID_VOLUME_RL_RAID4; *qual = G_RAID_VOLUME_RLQ_R4PN; } else if (strcasecmp(str, "RAID5-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RA; } else if (strcasecmp(str, "RAID5-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5RS; } else if (strcasecmp(str, "RAID5") == 0 || strcasecmp(str, "RAID5-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LA; } else if (strcasecmp(str, "RAID5-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5; *qual = G_RAID_VOLUME_RLQ_R5LS; } else if (strcasecmp(str, "RAID6-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RA; } else if (strcasecmp(str, "RAID6-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6RS; } else if (strcasecmp(str, "RAID6") == 0 || strcasecmp(str, "RAID6-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LA; } else if (strcasecmp(str, "RAID6-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID6; *qual = G_RAID_VOLUME_RLQ_R6LS; } else if (strcasecmp(str, "RAIDMDF-RA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRA; } else if (strcasecmp(str, "RAIDMDF-RS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFRS; } else if (strcasecmp(str, "RAIDMDF") == 0 || strcasecmp(str, "RAIDMDF-LA") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = 
G_RAID_VOLUME_RLQ_RMDFLA; } else if (strcasecmp(str, "RAIDMDF-LS") == 0) { *level = G_RAID_VOLUME_RL_RAIDMDF; *qual = G_RAID_VOLUME_RLQ_RMDFLS; } else if (strcasecmp(str, "RAID10") == 0 || strcasecmp(str, "RAID1E") == 0 || strcasecmp(str, "RAID1E-A") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EA; } else if (strcasecmp(str, "RAID1E-O") == 0) { *level = G_RAID_VOLUME_RL_RAID1E; *qual = G_RAID_VOLUME_RLQ_R1EO; } else if (strcasecmp(str, "SINGLE") == 0) *level = G_RAID_VOLUME_RL_SINGLE; else if (strcasecmp(str, "CONCAT") == 0) *level = G_RAID_VOLUME_RL_CONCAT; else if (strcasecmp(str, "RAID5E-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERA; } else if (strcasecmp(str, "RAID5E-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ERS; } else if (strcasecmp(str, "RAID5E") == 0 || strcasecmp(str, "RAID5E-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ELA; } else if (strcasecmp(str, "RAID5E-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5E; *qual = G_RAID_VOLUME_RLQ_R5ELS; } else if (strcasecmp(str, "RAID5EE-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERA; } else if (strcasecmp(str, "RAID5EE-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EERS; } else if (strcasecmp(str, "RAID5EE") == 0 || strcasecmp(str, "RAID5EE-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELA; } else if (strcasecmp(str, "RAID5EE-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5EE; *qual = G_RAID_VOLUME_RLQ_R5EELS; } else if (strcasecmp(str, "RAID5R-RA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRA; } else if (strcasecmp(str, "RAID5R-RS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RRS; } else if (strcasecmp(str, "RAID5R") == 0 || strcasecmp(str, "RAID5R-LA") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLA; } else if (strcasecmp(str, "RAID5R-LS") == 0) { *level = G_RAID_VOLUME_RL_RAID5R; *qual = G_RAID_VOLUME_RLQ_R5RLS; } else return (-1); return (0); } const char * g_raid_get_diskname(struct g_raid_disk *disk) { if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) return ("[unknown]"); return (disk->d_consumer->provider->name); } void g_raid_get_disk_info(struct g_raid_disk *disk) { struct g_consumer *cp = disk->d_consumer; int error, len; /* Read kernel dumping information. */ disk->d_kd.offset = 0; disk->d_kd.length = OFF_MAX; len = sizeof(disk->d_kd); error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd); if (error) disk->d_kd.di.dumper = NULL; if (disk->d_kd.di.dumper == NULL) G_RAID_DEBUG1(2, disk->d_softc, "Dumping not supported by %s: %d.", cp->provider->name, error); /* Read BIO_DELETE support. 
*/ error = g_getattr("GEOM::candelete", cp, &disk->d_candelete); if (error) disk->d_candelete = 0; if (!disk->d_candelete) G_RAID_DEBUG1(2, disk->d_softc, "BIO_DELETE not supported by %s: %d.", cp->provider->name, error); } void g_raid_report_disk_state(struct g_raid_disk *disk) { struct g_raid_subdisk *sd; int len, state; uint32_t s; if (disk->d_consumer == NULL) return; if (disk->d_state == G_RAID_DISK_S_DISABLED) { s = G_STATE_ACTIVE; /* XXX */ } else if (disk->d_state == G_RAID_DISK_S_FAILED || disk->d_state == G_RAID_DISK_S_STALE_FAILED) { s = G_STATE_FAILED; } else { state = G_RAID_SUBDISK_S_ACTIVE; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { if (sd->sd_state < state) state = sd->sd_state; } if (state == G_RAID_SUBDISK_S_FAILED) s = G_STATE_FAILED; else if (state == G_RAID_SUBDISK_S_NEW || state == G_RAID_SUBDISK_S_REBUILD) s = G_STATE_REBUILD; else if (state == G_RAID_SUBDISK_S_STALE || state == G_RAID_SUBDISK_S_RESYNC) s = G_STATE_RESYNC; else s = G_STATE_ACTIVE; } len = sizeof(s); g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", g_raid_get_diskname(disk), s); } void g_raid_change_disk_state(struct g_raid_disk *disk, int state) { G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.", g_raid_get_diskname(disk), g_raid_disk_state2str(disk->d_state), g_raid_disk_state2str(state)); disk->d_state = state; g_raid_report_disk_state(disk); } void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) { G_RAID_DEBUG1(0, sd->sd_softc, "Subdisk %s:%d-%s state changed from %s to %s.", sd->sd_volume->v_name, sd->sd_pos, sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", g_raid_subdisk_state2str(sd->sd_state), g_raid_subdisk_state2str(state)); sd->sd_state = state; if (sd->sd_disk) g_raid_report_disk_state(sd->sd_disk); } void g_raid_change_volume_state(struct g_raid_volume *vol, int state) { G_RAID_DEBUG1(0, vol->v_softc, "Volume %s state changed from %s to %s.", vol->v_name, g_raid_volume_state2str(vol->v_state), g_raid_volume_state2str(state)); vol->v_state = state; } /* * --- Events handling functions --- * Events in geom_raid are used to maintain subdisks and volumes status * from one thread to simplify locking. */ static void g_raid_event_free(struct g_raid_event *ep) { free(ep, M_RAID); } int g_raid_event_send(void *arg, int event, int flags) { struct g_raid_softc *sc; struct g_raid_event *ep; int error; if ((flags & G_RAID_EVENT_VOLUME) != 0) { sc = ((struct g_raid_volume *)arg)->v_softc; } else if ((flags & G_RAID_EVENT_DISK) != 0) { sc = ((struct g_raid_disk *)arg)->d_softc; } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { sc = ((struct g_raid_subdisk *)arg)->sd_softc; } else { sc = arg; } ep = malloc(sizeof(*ep), M_RAID, sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); if (ep == NULL) return (ENOMEM); ep->e_tgt = arg; ep->e_event = event; ep->e_flags = flags; ep->e_error = 0; G_RAID_DEBUG1(4, sc, "Sending event %p. 
Waking up %p.", ep, sc); mtx_lock(&sc->sc_queue_mtx); TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); wakeup(sc); if ((flags & G_RAID_EVENT_WAIT) == 0) return (0); sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); sx_xunlock(&sc->sc_lock); while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { mtx_lock(&sc->sc_queue_mtx); MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", hz * 5); } error = ep->e_error; g_raid_event_free(ep); sx_xlock(&sc->sc_lock); return (error); } static void g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep, *tmpep; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { if (ep->e_tgt != tgt) continue; TAILQ_REMOVE(&sc->sc_events, ep, e_next); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) g_raid_event_free(ep); else { ep->e_error = ECANCELED; wakeup(ep); } } mtx_unlock(&sc->sc_queue_mtx); } static int g_raid_event_check(struct g_raid_softc *sc, void *tgt) { struct g_raid_event *ep; int res = 0; sx_assert(&sc->sc_lock, SX_XLOCKED); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(ep, &sc->sc_events, e_next) { if (ep->e_tgt != tgt) continue; res = 1; break; } mtx_unlock(&sc->sc_queue_mtx); return (res); } /* * Return the number of disks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_ndisks(struct g_raid_softc *sc, int state) { struct g_raid_disk *disk; u_int n; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == state || state == -1) n++; } return (n); } /* * Return the number of subdisks in given state. * If state is equal to -1, count all connected disks. */ u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *subdisk; struct g_raid_softc *sc __diagused; u_int i, n ; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); n = 0; for (i = 0; i < vol->v_disks_count; i++) { subdisk = &vol->v_subdisks[i]; if ((state == -1 && subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || subdisk->sd_state == state) n++; } return (n); } /* * Return the first subdisk in given state. * If state is equal to -1, then the first connected disks. 
*/ struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, int state) { struct g_raid_subdisk *sd; struct g_raid_softc *sc __diagused; u_int i; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((state == -1 && sd->sd_state != G_RAID_SUBDISK_S_NONE) || sd->sd_state == state) return (sd); } return (NULL); } struct g_consumer * g_raid_open_consumer(struct g_raid_softc *sc, const char *name) { struct g_consumer *cp; struct g_provider *pp; g_topology_assert(); if (strncmp(name, _PATH_DEV, 5) == 0) name += 5; pp = g_provider_by_name(name); if (pp == NULL) return (NULL); cp = g_new_consumer(sc->sc_geom); cp->flags |= G_CF_DIRECT_RECEIVE; if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 1, 1) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } return (cp); } static u_int g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) { struct bio *bp; u_int nreqs = 0; mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { if (bp->bio_from == cp) nreqs++; } mtx_unlock(&sc->sc_queue_mtx); return (nreqs); } u_int g_raid_nopens(struct g_raid_softc *sc) { struct g_raid_volume *vol; u_int opens; opens = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_provider_open != 0) opens++; } return (opens); } static int g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) { if (cp->index > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s exist, can't destroy it now.", cp->provider->name); return (1); } if (g_raid_nrequests(sc, cp) > 0) { G_RAID_DEBUG1(2, sc, "I/O requests for %s in queue, can't destroy it now.", cp->provider->name); return (1); } return (0); } static void g_raid_destroy_consumer(void *arg, int flags __unused) { struct g_consumer *cp; g_topology_assert(); cp = arg; G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); g_detach(cp); g_destroy_consumer(cp); } void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) { struct g_provider *pp; int retaste_wait; g_topology_assert_not(); g_topology_lock(); cp->private = NULL; if (g_raid_consumer_is_busy(sc, cp)) goto out; pp = cp->provider; retaste_wait = 0; if (cp->acw == 1) { if ((pp->geom->flags & G_GEOM_WITHER) == 0) retaste_wait = 1; } if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); if (retaste_wait) { /* * After retaste event was send (inside g_access()), we can send * event to detach and destroy consumer. * A class, which has consumer to the given provider connected * will not receive retaste event for the provider. * This is the way how I ignore retaste events when I close * consumers opened for write: I detach and destroy consumer * after retaste event is sent. 
*/ g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); goto out; } G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); g_detach(cp); g_destroy_consumer(cp); out: g_topology_unlock(); } static void g_raid_orphan(struct g_consumer *cp) { struct g_raid_disk *disk; g_topology_assert(); disk = cp->private; if (disk == NULL) return; g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, G_RAID_EVENT_DISK); } static void g_raid_clean(struct g_raid_volume *vol, int acw) { struct g_raid_softc *sc; int timeout; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; if (!vol->v_dirty) return; if (vol->v_writes > 0) return; if (acw > 0 || (acw == -1 && vol->v_provider != NULL && vol->v_provider->acw > 0)) { timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); if (!g_raid_shutdown && timeout > 0) return; } vol->v_dirty = 0; G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } static void g_raid_dirty(struct g_raid_volume *vol) { struct g_raid_softc *sc; sc = vol->v_softc; g_topology_assert_not(); sx_assert(&sc->sc_lock, SX_XLOCKED); // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) // return; vol->v_dirty = 1; G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", vol->v_name); g_raid_write_metadata(sc, vol, NULL, NULL); } void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct bio_queue_head queue; struct bio *cbp; int i; vol = tr->tro_volume; /* * Allocate all bios before sending any request, so we can return * ENOMEM in nice and clean way. */ bioq_init(&queue); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) continue; cbp = g_clone_bio(bp); if (cbp == NULL) goto failure; cbp->bio_caller1 = sd; bioq_insert_tail(&queue, cbp); } while ((cbp = bioq_takefirst(&queue)) != NULL) { sd = cbp->bio_caller1; cbp->bio_caller1 = NULL; g_raid_subdisk_iostart(sd, cbp); } return; failure: while ((cbp = bioq_takefirst(&queue)) != NULL) g_destroy_bio(cbp); if (bp->bio_error == 0) bp->bio_error = ENOMEM; g_raid_iodone(bp, bp->bio_error); } static void g_raid_tr_kerneldump_common_done(struct bio *bp) { bp->bio_flags |= BIO_DONE; } int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct bio bp; vol = tr->tro_volume; sc = vol->v_softc; g_reset_bio(&bp); bp.bio_cmd = BIO_WRITE; bp.bio_done = g_raid_tr_kerneldump_common_done; bp.bio_attribute = NULL; bp.bio_offset = offset; bp.bio_length = length; bp.bio_data = virtual; bp.bio_to = vol->v_provider; g_raid_start(&bp); while (!(bp.bio_flags & BIO_DONE)) { G_RAID_DEBUG1(4, sc, "Poll..."); g_raid_poll(sc); DELAY(10); } return (bp.bio_error != 0 ? 
EIO : 0); } static int g_raid_dump(void *arg, void *virtual, off_t offset, size_t length) { struct g_raid_volume *vol; int error; vol = (struct g_raid_volume *)arg; G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", (long long unsigned)offset, (long long unsigned)length); error = G_RAID_TR_KERNELDUMP(vol->v_tr, virtual, offset, length); return (error); } static void g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) { struct g_kerneldump *gkd; struct g_provider *pp; struct g_raid_volume *vol; gkd = (struct g_kerneldump*)bp->bio_data; pp = bp->bio_to; vol = pp->private; g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); gkd->di.dumper = g_raid_dump; gkd->di.priv = vol; gkd->di.blocksize = vol->v_sectorsize; gkd->di.maxiosize = DFLTPHYS; gkd->di.mediaoffset = gkd->offset; if ((gkd->offset + gkd->length) > vol->v_mediasize) gkd->length = vol->v_mediasize - gkd->offset; gkd->di.mediasize = gkd->length; g_io_deliver(bp, 0); } static void g_raid_candelete(struct g_raid_softc *sc, struct bio *bp) { struct g_provider *pp; struct g_raid_volume *vol; struct g_raid_subdisk *sd; int i, val; pp = bp->bio_to; vol = pp->private; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) continue; if (sd->sd_disk->d_candelete) break; } val = i < vol->v_disks_count; g_handleattr(bp, "GEOM::candelete", &val, sizeof(val)); } static void g_raid_start(struct bio *bp) { struct g_raid_softc *sc; sc = bp->bio_to->geom->softc; /* * If sc == NULL or there are no valid disks, provider's error * should be set and g_raid_start() should not be called at all. */ // KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, // ("Provider's error should be set (error=%d)(mirror=%s).", // bp->bio_to->error, bp->bio_to->name)); G_RAID_LOGREQ(3, bp, "Request received."); switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: case BIO_DELETE: case BIO_FLUSH: case BIO_SPEEDUP: break; case BIO_GETATTR: if (!strcmp(bp->bio_attribute, "GEOM::candelete")) g_raid_candelete(sc, bp); else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) g_raid_kerneldump(sc, bp); else g_io_deliver(bp, EOPNOTSUPP); return; default: g_io_deliver(bp, EOPNOTSUPP); return; } mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) { G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); wakeup(sc); } } static int g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) { /* * 5 cases: * (1) bp entirely below NO * (2) bp entirely above NO * (3) bp start below, but end in range YES * (4) bp entirely within YES * (5) bp starts within, ends above YES * * lock range 10-19 (offset 10 length 10) * (1) 1-5: first if kicks it out * (2) 30-35: second if kicks it out * (3) 5-15: passes both ifs * (4) 12-14: passes both ifs * (5) 19-20: passes both */ off_t lend = lstart + len - 1; off_t bstart = bp->bio_offset; off_t bend = bp->bio_offset + bp->bio_length - 1; if (bend < lstart) return (0); if (lend < bstart) return (0); return (1); } static int g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) { struct g_raid_lock *lp; sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); LIST_FOREACH(lp, &vol->v_locks, l_next) { if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) return (1); } return (0); } static void g_raid_start_request(struct bio *bp) { struct g_raid_softc *sc __diagused; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; 
sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; /* * Check to see if this item is in a locked range. If so, * queue it to our locked queue and return. We'll requeue * it when the range is unlocked. Internal I/O for the * rebuild/rescan/recovery process is excluded from this * check so we can actually do the recovery. */ if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && g_raid_is_in_locked_range(vol, bp)) { G_RAID_LOGREQ(3, bp, "Defer request."); bioq_insert_tail(&vol->v_locked, bp); return; } /* * If we're actually going to do the write/delete, then * update the idle stats for the volume. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { if (!vol->v_dirty) g_raid_dirty(vol); vol->v_writes++; } /* * Put request onto inflight queue, so we can check if new * synchronization requests don't collide with it. Then tell * the transformation layer to start the I/O. */ bioq_insert_tail(&vol->v_inflight, bp); G_RAID_LOGREQ(4, bp, "Request started"); G_RAID_TR_IOSTART(vol->v_tr, bp); } static void g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) { off_t off, len; struct bio *nbp; struct g_raid_lock *lp; vol->v_pending_lock = 0; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_pending) { off = lp->l_offset; len = lp->l_length; lp->l_pending = 0; TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { if (g_raid_bio_overlaps(nbp, off, len)) lp->l_pending++; } if (lp->l_pending) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock(%jd, %jd) has %d pending", (intmax_t)off, (intmax_t)(off + len), lp->l_pending); continue; } G_RAID_DEBUG1(4, vol->v_softc, "Deferred lock of %jd to %jd completed", (intmax_t)off, (intmax_t)(off + len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); } } } void g_raid_iodone(struct bio *bp, int error) { struct g_raid_softc *sc __diagused; struct g_raid_volume *vol; sc = bp->bio_to->geom->softc; sx_assert(&sc->sc_lock, SX_LOCKED); vol = bp->bio_to->private; G_RAID_LOGREQ(3, bp, "Request done: %d.", error); /* Update stats if we done write/delete. */ if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { vol->v_writes--; vol->v_last_write = time_uptime; } bioq_remove(&vol->v_inflight, bp); if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) g_raid_finish_with_locked_ranges(vol, bp); getmicrouptime(&vol->v_last_done); g_io_deliver(bp, error); } int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, struct bio *ignore, void *argp) { struct g_raid_softc *sc; struct g_raid_lock *lp; struct bio *bp; sc = vol->v_softc; lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); lp->l_offset = off; lp->l_length = len; lp->l_callback_arg = argp; lp->l_pending = 0; TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) lp->l_pending++; } /* * If there are any writes that are pending, we return EBUSY. All * callers will have to wait until all pending writes clear. 
*/ if (lp->l_pending > 0) { vol->v_pending_lock = 1; G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", (intmax_t)off, (intmax_t)(off+len), lp->l_pending); return (EBUSY); } G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", (intmax_t)off, (intmax_t)(off+len)); G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); return (0); } int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) { struct g_raid_lock *lp; struct g_raid_softc *sc; struct bio *bp; sc = vol->v_softc; LIST_FOREACH(lp, &vol->v_locks, l_next) { if (lp->l_offset == off && lp->l_length == len) { LIST_REMOVE(lp, l_next); /* XXX * Right now we just put them all back on the queue * and hope for the best. We hope this because any * locked ranges will go right back on this list * when the worker thread runs. * XXX */ G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", (intmax_t)lp->l_offset, (intmax_t)(lp->l_offset+lp->l_length)); mtx_lock(&sc->sc_queue_mtx); while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); free(lp, M_RAID); return (0); } } return (EINVAL); } void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) { struct g_consumer *cp; struct g_raid_disk *disk, *tdisk; bp->bio_caller1 = sd; /* * Make sure that the disk is present. Generally it is a task of * transformation layers to not send requests to absent disks, but * it is better to be safe and report situation then sorry. */ if (sd->sd_disk == NULL) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); nodisk: bp->bio_from = NULL; bp->bio_to = NULL; bp->bio_error = ENXIO; g_raid_disk_done(bp); return; } disk = sd->sd_disk; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_FAILED) { G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); goto nodisk; } cp = disk->d_consumer; bp->bio_from = cp; bp->bio_to = cp->provider; cp->index++; /* Update average disks load. 
*/ TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { if (tdisk->d_consumer == NULL) tdisk->d_load = 0; else tdisk->d_load = (tdisk->d_consumer->index * G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; } disk->d_last_offset = bp->bio_offset + bp->bio_length; if (dumping) { G_RAID_LOGREQ(3, bp, "Sending dumping request."); if (bp->bio_cmd == BIO_WRITE) { bp->bio_error = g_raid_subdisk_kerneldump(sd, bp->bio_data, bp->bio_offset, bp->bio_length); } else bp->bio_error = EOPNOTSUPP; g_raid_disk_done(bp); } else { bp->bio_done = g_raid_disk_done; bp->bio_offset += sd->sd_offset; G_RAID_LOGREQ(3, bp, "Sending request."); g_io_request(bp, cp); } } int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual, off_t offset, size_t length) { if (sd->sd_disk == NULL) return (ENXIO); if (sd->sd_disk->d_kd.di.dumper == NULL) return (EOPNOTSUPP); - return (dump_write(&sd->sd_disk->d_kd.di, - virtual, 0, - sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, - length)); + return (dump_write(&sd->sd_disk->d_kd.di, virtual, + sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, length)); } static void g_raid_disk_done(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; sd = bp->bio_caller1; sc = sd->sd_softc; mtx_lock(&sc->sc_queue_mtx); bioq_insert_tail(&sc->sc_queue, bp); mtx_unlock(&sc->sc_queue_mtx); if (!dumping) wakeup(sc); } static void g_raid_disk_done_request(struct bio *bp) { struct g_raid_softc *sc; struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_volume *vol; g_topology_assert_not(); G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); sd = bp->bio_caller1; sc = sd->sd_softc; vol = sd->sd_volume; if (bp->bio_from != NULL) { bp->bio_from->index--; disk = bp->bio_from->private; if (disk == NULL) g_raid_kill_consumer(sc, bp->bio_from); } bp->bio_offset -= sd->sd_offset; G_RAID_TR_IODONE(vol->v_tr, sd, bp); } static void g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) { if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event); else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); else ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { KASSERT(ep->e_error == 0, ("Error cannot be handled.")); g_raid_event_free(ep); } else { ep->e_flags |= G_RAID_EVENT_DONE; G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); mtx_lock(&sc->sc_queue_mtx); wakeup(ep); mtx_unlock(&sc->sc_queue_mtx); } } /* * Worker thread. */ static void g_raid_worker(void *arg) { struct g_raid_softc *sc; struct g_raid_event *ep; struct g_raid_volume *vol; struct bio *bp; struct timeval now, t; int timeout, rv; sc = arg; thread_lock(curthread); sched_prio(curthread, PRIBIO); thread_unlock(curthread); sx_xlock(&sc->sc_lock); for (;;) { mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. 
*/ bp = NULL; vol = NULL; rv = 0; ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) TAILQ_REMOVE(&sc->sc_events, ep, e_next); else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) ; else { getmicrouptime(&now); t = now; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr && timevalcmp(&vol->v_last_done, &t, < )) t = vol->v_last_done; } timevalsub(&t, &now); timeout = g_raid_idle_threshold + t.tv_sec * 1000000 + t.tv_usec; if (timeout > 0) { /* * Two steps to avoid overflows at HZ=1000 * and idle timeouts > 2.1s. Some rounding * errors can occur, but they are < 1tick, * which is deemed to be close enough for * this purpose. */ int micpertic = 1000000 / hz; timeout = (timeout + micpertic - 1) / micpertic; sx_xunlock(&sc->sc_lock); MSLEEP(rv, sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "-", timeout); sx_xlock(&sc->sc_lock); goto process; } else rv = EWOULDBLOCK; } mtx_unlock(&sc->sc_queue_mtx); process: if (ep != NULL) { g_raid_handle_event(sc, ep); } else if (bp != NULL) { if (bp->bio_to != NULL && bp->bio_to->geom == sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } else if (rv == EWOULDBLOCK) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_clean(vol, -1); if (bioq_first(&vol->v_inflight) == NULL && vol->v_tr) { t.tv_sec = g_raid_idle_threshold / 1000000; t.tv_usec = g_raid_idle_threshold % 1000000; timevaladd(&t, &vol->v_last_done); getmicrouptime(&now); if (timevalcmp(&t, &now, <= )) { G_RAID_TR_IDLE(vol->v_tr); vol->v_last_done = now; } } } } if (sc->sc_stopping == G_RAID_DESTROY_HARD) g_raid_destroy_node(sc, 1); /* May not return. */ } } static void g_raid_poll(struct g_raid_softc *sc) { struct g_raid_event *ep; struct bio *bp; sx_xlock(&sc->sc_lock); mtx_lock(&sc->sc_queue_mtx); /* * First take a look at events. * This is important to handle events before any I/O requests. */ ep = TAILQ_FIRST(&sc->sc_events); if (ep != NULL) { TAILQ_REMOVE(&sc->sc_events, ep, e_next); mtx_unlock(&sc->sc_queue_mtx); g_raid_handle_event(sc, ep); goto out; } bp = bioq_takefirst(&sc->sc_queue); if (bp != NULL) { mtx_unlock(&sc->sc_queue_mtx); if (bp->bio_from == NULL || bp->bio_from->geom != sc->sc_geom) g_raid_start_request(bp); else g_raid_disk_done_request(bp); } out: sx_xunlock(&sc->sc_lock); } static void g_raid_launch_provider(struct g_raid_volume *vol) { struct g_raid_disk *disk; struct g_raid_subdisk *sd; struct g_raid_softc *sc; struct g_provider *pp; char name[G_RAID_MAX_VOLUMENAME]; off_t off; int i; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_LOCKED); g_topology_lock(); /* Try to name provider with volume name. */ snprintf(name, sizeof(name), "raid/%s", vol->v_name); if (g_raid_name_format == 0 || vol->v_name[0] == 0 || g_provider_by_name(name) != NULL) { /* Otherwise use sequential volume number. 
*/ snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); } pp = g_new_providerf(sc->sc_geom, "%s", name); pp->flags |= G_PF_DIRECT_RECEIVE; if (vol->v_tr->tro_class->trc_accept_unmapped) { pp->flags |= G_PF_ACCEPT_UNMAPPED; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) continue; if ((sd->sd_disk->d_consumer->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0) pp->flags &= ~G_PF_ACCEPT_UNMAPPED; } } pp->private = vol; pp->mediasize = vol->v_mediasize; pp->sectorsize = vol->v_sectorsize; pp->stripesize = 0; pp->stripeoffset = 0; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { if ((disk = vol->v_subdisks[0].sd_disk) != NULL && disk->d_consumer != NULL && disk->d_consumer->provider != NULL) { pp->stripesize = disk->d_consumer->provider->stripesize; off = disk->d_consumer->provider->stripeoffset; pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; if (off > 0) pp->stripeoffset %= off; } if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { pp->stripesize *= (vol->v_disks_count - 1); pp->stripeoffset *= (vol->v_disks_count - 1); } } else pp->stripesize = vol->v_strip_size; vol->v_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", pp->name, vol->v_name); } static void g_raid_destroy_provider(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_provider *pp; struct bio *bp, *tmp; g_topology_assert_not(); sc = vol->v_softc; pp = vol->v_provider; KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); g_topology_lock(); g_error_provider(pp, ENXIO); mtx_lock(&sc->sc_queue_mtx); TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { if (bp->bio_to != pp) continue; bioq_remove(&sc->sc_queue, bp); g_io_deliver(bp, ENXIO); } mtx_unlock(&sc->sc_queue_mtx); G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", pp->name, vol->v_name); g_wither_provider(pp, ENXIO); g_topology_unlock(); vol->v_provider = NULL; } /* * Update device state. */ static int g_raid_update_volume(struct g_raid_volume *vol, u_int event) { struct g_raid_softc *sc; sc = vol->v_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", g_raid_volume_event2str(event), vol->v_name); switch (event) { case G_RAID_VOLUME_E_DOWN: if (vol->v_provider != NULL) g_raid_destroy_provider(vol); break; case G_RAID_VOLUME_E_UP: if (vol->v_provider == NULL) g_raid_launch_provider(vol); break; case G_RAID_VOLUME_E_START: if (vol->v_tr) G_RAID_TR_START(vol->v_tr); return (0); default: if (sc->sc_md) G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); return (0); } /* Manage root mount release. */ if (vol->v_starting) { vol->v_starting = 0; G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); root_mount_rel(vol->v_rootmount); vol->v_rootmount = NULL; } if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); return (0); } /* * Update subdisk state. */ static int g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) { struct g_raid_softc *sc; struct g_raid_volume *vol; sc = sd->sd_softc; vol = sd->sd_volume; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", g_raid_subdisk_event2str(event), vol->v_name, sd->sd_pos, sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); if (vol->v_tr) G_RAID_TR_EVENT(vol->v_tr, sd, event); return (0); } /* * Update disk state. */ static int g_raid_update_disk(struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = disk->d_softc; sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", g_raid_disk_event2str(event), g_raid_get_diskname(disk)); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, disk, event); return (0); } /* * Node event. */ static int g_raid_update_node(struct g_raid_softc *sc, u_int event) { sx_assert(&sc->sc_lock, SX_XLOCKED); G_RAID_DEBUG1(2, sc, "Event %s for the array.", g_raid_node_event2str(event)); if (event == G_RAID_NODE_E_WAKE) return (0); if (sc->sc_md) G_RAID_MD_EVENT(sc->sc_md, NULL, event); return (0); } static int g_raid_access(struct g_provider *pp, int acr, int acw, int ace) { struct g_raid_volume *vol; struct g_raid_softc *sc; int dcw, opens, error = 0; g_topology_assert(); sc = pp->geom->softc; vol = pp->private; KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, acr, acw, ace); dcw = pp->acw + acw; g_topology_unlock(); sx_xlock(&sc->sc_lock); /* Deny new opens while dying. */ if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { error = ENXIO; goto out; } /* Deny write opens for read-only volumes. */ if (vol->v_read_only && acw > 0) { error = EROFS; goto out; } if (dcw == 0) g_raid_clean(vol, dcw); vol->v_provider_open += acr + acw + ace; /* Handle delayed node destruction. */ if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && vol->v_provider_open == 0) { /* Count open volumes. */ opens = g_raid_nopens(sc); if (opens == 0) { sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to make it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } } /* Handle open volume destruction. 
*/ if (vol->v_stopping && vol->v_provider_open == 0) g_raid_destroy_volume(vol); out: sx_xunlock(&sc->sc_lock); g_topology_lock(); return (error); } struct g_raid_softc * g_raid_create_node(struct g_class *mp, const char *name, struct g_raid_md_object *md) { struct g_raid_softc *sc; struct g_geom *gp; int error; g_topology_assert(); G_RAID_DEBUG(1, "Creating array %s.", name); gp = g_new_geomf(mp, "%s", name); sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO); gp->start = g_raid_start; gp->orphan = g_raid_orphan; gp->access = g_raid_access; gp->dumpconf = g_raid_dumpconf; sc->sc_md = md; sc->sc_geom = gp; sc->sc_flags = 0; TAILQ_INIT(&sc->sc_volumes); TAILQ_INIT(&sc->sc_disks); sx_init(&sc->sc_lock, "graid:lock"); mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF); TAILQ_INIT(&sc->sc_events); bioq_init(&sc->sc_queue); gp->softc = sc; error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0, "g_raid %s", name); if (error != 0) { G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name); mtx_destroy(&sc->sc_queue_mtx); sx_destroy(&sc->sc_lock); g_destroy_geom(sc->sc_geom); free(sc, M_RAID); return (NULL); } G_RAID_DEBUG1(0, sc, "Array %s created.", name); return (sc); } struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id) { struct g_raid_volume *vol, *vol1; int i; G_RAID_DEBUG1(1, sc, "Creating volume %s.", name); vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO); vol->v_softc = sc; strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME); vol->v_state = G_RAID_VOLUME_S_STARTING; vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN; vol->v_rotate_parity = 1; bioq_init(&vol->v_inflight); bioq_init(&vol->v_locked); LIST_INIT(&vol->v_locks); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { vol->v_subdisks[i].sd_softc = sc; vol->v_subdisks[i].sd_volume = vol; vol->v_subdisks[i].sd_pos = i; vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE; } /* Find free ID for this volume. */ g_topology_lock(); vol1 = vol; if (id >= 0) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } } if (vol1 != NULL) { for (id = 0; ; id++) { LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { if (vol1->v_global_id == id) break; } if (vol1 == NULL) break; } } vol->v_global_id = id; LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next); g_topology_unlock(); /* Delay root mounting. 
*/ vol->v_rootmount = root_mount_hold("GRAID"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); vol->v_starting = 1; TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); return (vol); } struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc) { struct g_raid_disk *disk; G_RAID_DEBUG1(1, sc, "Creating disk."); disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); disk->d_softc = sc; disk->d_state = G_RAID_DISK_S_NONE; TAILQ_INIT(&disk->d_subdisks); TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); return (disk); } int g_raid_start_volume(struct g_raid_volume *vol) { struct g_raid_tr_class *class; struct g_raid_tr_object *obj; int status; G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { if (!class->trc_enable) continue; G_RAID_DEBUG1(2, vol->v_softc, "Tasting volume %s for %s transformation.", vol->v_name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->tro_class = class; obj->tro_volume = vol; status = G_RAID_TR_TASTE(obj, vol); if (status != G_RAID_TR_TASTE_FAIL) break; kobj_delete((kobj_t)obj, M_RAID); } if (class == NULL) { G_RAID_DEBUG1(0, vol->v_softc, "No transformation module found for %s.", vol->v_name); vol->v_tr = NULL; g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, G_RAID_EVENT_VOLUME); return (-1); } G_RAID_DEBUG1(2, vol->v_softc, "Transformation module %s chosen for %s.", class->name, vol->v_name); vol->v_tr = obj; return (0); } int g_raid_destroy_node(struct g_raid_softc *sc, int worker) { struct g_raid_volume *vol, *tmpv; struct g_raid_disk *disk, *tmpd; int error = 0; sc->sc_stopping = G_RAID_DESTROY_HARD; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { if (g_raid_destroy_volume(vol)) error = EBUSY; } if (error) return (error); TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { if (g_raid_destroy_disk(disk)) error = EBUSY; } if (error) return (error); if (sc->sc_md) { G_RAID_MD_FREE(sc->sc_md); kobj_delete((kobj_t)sc->sc_md, M_RAID); sc->sc_md = NULL; } if (sc->sc_geom != NULL) { G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); g_topology_lock(); sc->sc_geom->softc = NULL; g_wither_geom(sc->sc_geom, ENXIO); g_topology_unlock(); sc->sc_geom = NULL; } else G_RAID_DEBUG(1, "Array destroyed."); if (worker) { g_raid_event_cancel(sc, sc); mtx_destroy(&sc->sc_queue_mtx); sx_xunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); wakeup(&sc->sc_stopping); free(sc, M_RAID); curthread->td_pflags &= ~TDP_GEOM; G_RAID_DEBUG(1, "Thread exiting."); kproc_exit(0); } else { /* Wake up worker to make it selfdestruct. 
*/ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_volume(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_disk *disk; int i; sc = vol->v_softc; G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name); vol->v_stopping = 1; if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { if (vol->v_tr) { G_RAID_TR_STOP(vol->v_tr); return (EBUSY); } else vol->v_state = G_RAID_VOLUME_S_STOPPED; } if (g_raid_event_check(sc, vol) != 0) return (EBUSY); if (vol->v_provider != NULL) return (EBUSY); if (vol->v_provider_open != 0) return (EBUSY); if (vol->v_tr) { G_RAID_TR_FREE(vol->v_tr); kobj_delete((kobj_t)vol->v_tr, M_RAID); vol->v_tr = NULL; } if (vol->v_rootmount) root_mount_rel(vol->v_rootmount); g_topology_lock(); LIST_REMOVE(vol, v_global_next); g_topology_unlock(); TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { g_raid_event_cancel(sc, &vol->v_subdisks[i]); disk = vol->v_subdisks[i].sd_disk; if (disk == NULL) continue; TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); } G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name); if (sc->sc_md) G_RAID_MD_FREE_VOLUME(sc->sc_md, vol); g_raid_event_cancel(sc, vol); free(vol, M_RAID); if (sc->sc_stopping == G_RAID_DESTROY_HARD) { /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); } return (0); } int g_raid_destroy_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmp; sc = disk->d_softc; G_RAID_DEBUG1(2, sc, "Destroying disk."); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); sd->sd_disk = NULL; } TAILQ_REMOVE(&sc->sc_disks, disk, d_next); if (sc->sc_md) G_RAID_MD_FREE_DISK(sc->sc_md, disk); g_raid_event_cancel(sc, disk); free(disk, M_RAID); return (0); } int g_raid_destroy(struct g_raid_softc *sc, int how) { int error, opens; g_topology_assert_not(); if (sc == NULL) return (ENXIO); sx_assert(&sc->sc_lock, SX_XLOCKED); /* Count open volumes. */ opens = g_raid_nopens(sc); /* React on some opened volumes. */ if (opens > 0) { switch (how) { case G_RAID_DESTROY_SOFT: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); sx_xunlock(&sc->sc_lock); return (EBUSY); case G_RAID_DESTROY_DELAYED: G_RAID_DEBUG1(1, sc, "Array will be destroyed on last close."); sc->sc_stopping = G_RAID_DESTROY_DELAYED; sx_xunlock(&sc->sc_lock); return (EBUSY); case G_RAID_DESTROY_HARD: G_RAID_DEBUG1(1, sc, "%d volumes are still open.", opens); } } /* Mark node for destruction. */ sc->sc_stopping = G_RAID_DESTROY_HARD; /* Wake up worker to let it selfdestruct. */ g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); /* Sleep until node destroyed. */ error = sx_sleep(&sc->sc_stopping, &sc->sc_lock, PRIBIO | PDROP, "r:destroy", hz * 3); return (error == EWOULDBLOCK ? 
EBUSY : 0); } static void g_raid_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static struct g_geom * g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) { struct g_consumer *cp; struct g_geom *gp, *geom; struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; g_topology_assert(); g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); if (!g_raid_enable) return (NULL); G_RAID_DEBUG(2, "Tasting provider %s.", pp->name); geom = NULL; status = G_RAID_MD_TASTE_FAIL; gp = g_new_geomf(mp, "raid:taste"); /* * This orphan function should be never called. */ gp->orphan = g_raid_taste_orphan; cp = g_new_consumer(gp); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; if (g_attach(cp, pp) != 0) goto ofail2; if (g_access(cp, 1, 0, 0) != 0) goto ofail; LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (!class->mdc_enable) continue; G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.", pp->name, class->name); obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_TASTE(obj, mp, cp, &geom); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); if (status != G_RAID_MD_TASTE_FAIL) break; } if (status == G_RAID_MD_TASTE_FAIL) (void)g_access(cp, -1, 0, 0); ofail: g_detach(cp); ofail2: g_destroy_consumer(cp); g_destroy_geom(gp); G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name); return (geom); } int g_raid_create_node_format(const char *format, struct gctl_req *req, struct g_geom **gp) { struct g_raid_md_class *class; struct g_raid_md_object *obj; int status; G_RAID_DEBUG(2, "Creating array for %s metadata.", format); LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { if (strcasecmp(class->name, format) == 0) break; } if (class == NULL) { G_RAID_DEBUG(1, "No support for %s metadata.", format); return (G_RAID_MD_TASTE_FAIL); } obj = (void *)kobj_create((kobj_class_t)class, M_RAID, M_WAITOK); obj->mdo_class = class; status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp); if (status != G_RAID_MD_TASTE_NEW) kobj_delete((kobj_t)obj, M_RAID); return (status); } static int g_raid_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, struct g_geom *gp) { struct g_raid_softc *sc; int error; g_topology_unlock(); sc = gp->softc; sx_xlock(&sc->sc_lock); g_cancel_event(sc); error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT); g_topology_lock(); return (error); } void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (sc->sc_stopping == G_RAID_DESTROY_HARD) return; if (sc->sc_md) G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk); } void g_raid_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, struct g_raid_disk *disk) { if (disk == NULL) disk = sd->sd_disk; if (disk == NULL) { G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!"); return; } if (disk->d_state != G_RAID_DISK_S_ACTIVE) { G_RAID_DEBUG1(0, sc, "Warning! 
Fail request to a disk in a " "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); return; } if (sc->sc_md) G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk); } static void g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, s; g_topology_assert(); sc = gp->softc; if (sc == NULL) return; if (pp != NULL) { vol = pp->private; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%s %s volume\n", indent, sc->sc_md->mdo_class->name, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s\n", indent, vol->v_name); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_level2str(vol->v_raid_level, vol->v_raid_level_qualifier)); sbuf_printf(sb, "%s%s\n", indent, vol->v_tr ? vol->v_tr->tro_class->name : "NONE"); sbuf_printf(sb, "%s%u\n", indent, vol->v_disks_count); sbuf_printf(sb, "%s%u\n", indent, vol->v_strip_size); sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(vol->v_state)); sbuf_printf(sb, "%s%s\n", indent, vol->v_dirty ? "Yes" : "No"); sbuf_printf(sb, "%s", indent); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk != NULL && sd->sd_disk->d_consumer != NULL) { sbuf_printf(sb, "%s ", g_raid_get_diskname(sd->sd_disk)); } else { sbuf_cat(sb, "NONE "); } sbuf_printf(sb, "(%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } sbuf_cat(sb, ")"); if (i + 1 < vol->v_disks_count) sbuf_cat(sb, ", "); } sbuf_cat(sb, "\n"); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else if (cp != NULL) { disk = cp->private; if (disk == NULL) return; g_topology_unlock(); sx_xlock(&sc->sc_lock); sbuf_printf(sb, "%s%s", indent, g_raid_disk_state2str(disk->d_state)); if (!TAILQ_EMPTY(&disk->d_subdisks)) { sbuf_cat(sb, " ("); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "%s", g_raid_subdisk_state2str(sd->sd_state)); if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { sbuf_printf(sb, " %d%%", (int)(sd->sd_rebuild_pos * 100 / sd->sd_size)); } if (TAILQ_NEXT(sd, sd_next)) sbuf_cat(sb, ", "); } sbuf_cat(sb, ")"); } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s", indent); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { sbuf_printf(sb, "r%d(%s):%d@%ju", sd->sd_volume->v_global_id, sd->sd_volume->v_name, sd->sd_pos, (uintmax_t)sd->sd_offset); if (TAILQ_NEXT(sd, sd_next)) sbuf_cat(sb, ", "); } sbuf_cat(sb, "\n"); sbuf_printf(sb, "%s%d\n", indent, disk->d_read_errs); sx_xunlock(&sc->sc_lock); g_topology_lock(); } else { g_topology_unlock(); sx_xlock(&sc->sc_lock); if (sc->sc_md) { sbuf_printf(sb, "%s%s\n", indent, sc->sc_md->mdo_class->name); } if (!TAILQ_EMPTY(&sc->sc_volumes)) { s = 0xff; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_state < s) s = vol->v_state; } sbuf_printf(sb, "%s%s\n", indent, g_raid_volume_state2str(s)); } sx_xunlock(&sc->sc_lock); g_topology_lock(); } } static void g_raid_shutdown_post_sync(void *arg, int howto) { struct g_class *mp; struct g_geom *gp, *gp2; struct g_raid_softc *sc; struct g_raid_volume *vol; mp = arg; g_topology_lock(); g_raid_shutdown = 1; LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { if ((sc = gp->softc) == NULL) continue; g_topology_unlock(); sx_xlock(&sc->sc_lock); TAILQ_FOREACH(vol, &sc->sc_volumes, 
v_next) g_raid_clean(vol, -1); g_cancel_event(sc); g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); g_topology_lock(); } g_topology_unlock(); } static void g_raid_init(struct g_class *mp) { g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); if (g_raid_post_sync == NULL) G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); g_raid_started = 1; } static void g_raid_fini(struct g_class *mp) { if (g_raid_post_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync); g_raid_started = 0; } int g_raid_md_modevent(module_t mod, int type, void *arg) { struct g_raid_md_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_md_classes); if (c == NULL || c->mdc_priority > class->mdc_priority) LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); else { while ((nc = LIST_NEXT(c, mdc_list)) != NULL && nc->mdc_priority < class->mdc_priority) c = nc; LIST_INSERT_AFTER(c, class, mdc_list); } if (g_raid_started) g_retaste(&g_raid_class); break; case MOD_UNLOAD: LIST_REMOVE(class, mdc_list); break; default: error = EOPNOTSUPP; break; } return (error); } int g_raid_tr_modevent(module_t mod, int type, void *arg) { struct g_raid_tr_class *class, *c, *nc; int error; error = 0; class = arg; switch (type) { case MOD_LOAD: c = LIST_FIRST(&g_raid_tr_classes); if (c == NULL || c->trc_priority > class->trc_priority) LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); else { while ((nc = LIST_NEXT(c, trc_list)) != NULL && nc->trc_priority < class->trc_priority) c = nc; LIST_INSERT_AFTER(c, class, trc_list); } break; case MOD_UNLOAD: LIST_REMOVE(class, trc_list); break; default: error = EOPNOTSUPP; break; } return (error); } /* * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) * to reduce module priority, allowing submodules to register them first. */ static moduledata_t g_raid_mod = { "g_raid", g_modevent, &g_raid_class }; DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); MODULE_VERSION(geom_raid, 0); diff --git a/sys/i386/i386/minidump_machdep_base.c b/sys/i386/i386/minidump_machdep_base.c index ce26e339d2be..5072474b9f8a 100644 --- a/sys/i386/i386/minidump_machdep_base.c +++ b/sys/i386/i386/minidump_machdep_base.c @@ -1,350 +1,350 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); #define MD_ALIGN(x) (((off_t)(x) + PAGE_MASK) & ~PAGE_MASK) #define DEV_ALIGN(x) roundup2((off_t)(x), DEV_BSIZE) static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); - error = dump_append(di, dump_va, 0, fragsz); + error = dump_append(di, dump_va, fragsz); fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, i, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) { printf("address not page aligned\n"); return (EINVAL); } if (ptr != NULL) { /* If we're doing a virtual dump, flush any pre-existing pa pages */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; dumpsys_pb_progress(len); wdog_kern_pat(WD_LASTVAL); if (ptr) { - error = dump_append(di, ptr, 0, len); + error = dump_append(di, ptr, len); if (error) return (error); ptr += len; sz -= len; } else { for (i = 0; i < len; i += PAGE_SIZE) dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT); fragsz += len; pa += len; sz -= len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } /* A fake page table page, to avoid having to handle both 4K and 2M pages */ static pt_entry_t fakept[NPTEPG]; #ifdef PMAP_PAE_COMP #define cpu_minidumpsys cpu_minidumpsys_pae #define IdlePTD IdlePTD_pae #else #define cpu_minidumpsys cpu_minidumpsys_nopae #define IdlePTD IdlePTD_nopae #endif int cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) { uint64_t dumpsize; uint32_t ptesize; vm_offset_t va, kva_end; int error; uint64_t pa; pd_entry_t *pd, pde; pt_entry_t *pt, pte; int k; struct minidumphdr mdhdr; struct msgbuf *mbp; /* Snapshot the KVA upper bound in case it grows. */ kva_end = kernel_vm_end; /* * Walk the kernel page table pages, setting the active entries in the * dump bitmap. * * NB: for a live dump, we may be racing with updates to the page * tables, so care must be taken to read each entry only once. */ ptesize = 0; for (va = KERNBASE; va < kva_end; va += NBPDR) { /* * We always write a page, even if it is zero. 
Each * page written corresponds to 2MB of space */ ptesize += PAGE_SIZE; pd = IdlePTD; /* always mapped! */ pde = pte_load(&pd[va >> PDRSHIFT]); if ((pde & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is an entire 2M page. */ pa = pde & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { if (vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); pa += PAGE_SIZE; } continue; } if ((pde & PG_V) == PG_V) { /* set bit for each valid page in this 2MB block */ pt = pmap_kenter_temporary(pde & PG_FRAME, 0); for (k = 0; k < NPTEPG; k++) { pte = pte_load(&pt[k]); if ((pte & PG_V) == PG_V) { pa = pte & PG_FRAME; if (vm_phys_is_dumpable(pa)) vm_page_dump_add( state->dump_bitset, pa); } } } else { /* nothing, we're going to dump a null page */ } } /* Calculate dump size. */ mbp = state->msgbufp; dumpsize = ptesize; dumpsize += round_page(mbp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { /* Clear out undumpable pages now if needed */ if (vm_phys_is_dumpable(pa)) { dumpsize += PAGE_SIZE; } else { vm_page_dump_drop(state->dump_bitset, pa); } } dumpsize += PAGE_SIZE; dumpsys_pb_init(dumpsize); /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.ptesize = ptesize; mdhdr.kernbase = KERNBASE; mdhdr.paemode = pae_mode; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_I386_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Physical memory: %ju MB\n", ptoa((uintmax_t)physmem) / 1048576); printf("Dumping %llu MB:", (long long)dumpsize >> 20); /* Dump my header */ bzero(&fakept, sizeof(fakept)); bcopy(&mdhdr, &fakept, sizeof(mdhdr)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, (char *)mbp->msg_ptr, 0, round_page(mbp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(fakept), "Large dump_avail not handled"); bzero(fakept, sizeof(fakept)); memcpy(fakept, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page table pages */ for (va = KERNBASE; va < kva_end; va += NBPDR) { /* We always write a page, even if it is zero */ pd = IdlePTD; /* always mapped! */ pde = pte_load(&pd[va >> PDRSHIFT]); if ((pde & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is a single 2M block. 
Generate a fake PTP */ pa = pde & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M; } error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakept in the same block */ error = blk_flush(di); if (error) goto fail; continue; } if ((pde & PG_V) == PG_V) { pa = pde & PG_FRAME; error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } else { bzero(fakept, sizeof(fakept)); error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); if (error) goto fail; /* flush, in case we reuse fakept in the same block */ error = blk_flush(di); if (error) goto fail; } } /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) { printf("\nDump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } diff --git a/sys/kern/kern_dump.c b/sys/kern/kern_dump.c index 17ac4e418645..393c4983d08b 100644 --- a/sys/kern/kern_dump.c +++ b/sys/kern/kern_dump.c @@ -1,535 +1,535 @@ /*- * Copyright (c) 2002 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); #define MD_ALIGN(x) roundup2((off_t)(x), PAGE_SIZE) /* Handle buffered writes. 
*/ static size_t fragsz; struct dump_pa dump_map[DUMPSYS_MD_PA_NPAIRS]; #if !defined(__powerpc__) void dumpsys_gen_pa_init(void) { int n, idx; bzero(dump_map, sizeof(dump_map)); for (n = 0; n < nitems(dump_map); n++) { idx = n * 2; if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0) break; dump_map[n].pa_start = dump_avail[idx]; dump_map[n].pa_size = dump_avail[idx + 1] - dump_avail[idx]; } } #endif struct dump_pa * dumpsys_gen_pa_next(struct dump_pa *mdp) { if (mdp == NULL) return (&dump_map[0]); mdp++; if (mdp->pa_size == 0) mdp = NULL; return (mdp); } void dumpsys_gen_wbinv_all(void) { } void dumpsys_gen_unmap_chunk(vm_paddr_t pa __unused, size_t chunk __unused, void *va __unused) { } int dumpsys_gen_write_aux_headers(struct dumperinfo *di) { return (0); } int dumpsys_buf_seek(struct dumperinfo *di, size_t sz) { static uint8_t buf[DEV_BSIZE]; size_t nbytes; int error; bzero(buf, sizeof(buf)); while (sz > 0) { nbytes = MIN(sz, sizeof(buf)); - error = dump_append(di, buf, 0, nbytes); + error = dump_append(di, buf, nbytes); if (error) return (error); sz -= nbytes; } return (0); } int dumpsys_buf_write(struct dumperinfo *di, char *ptr, size_t sz) { size_t len; int error; while (sz) { len = di->blocksize - fragsz; if (len > sz) len = sz; memcpy((char *)di->blockbuf + fragsz, ptr, len); fragsz += len; ptr += len; sz -= len; if (fragsz == di->blocksize) { - error = dump_append(di, di->blockbuf, 0, di->blocksize); + error = dump_append(di, di->blockbuf, di->blocksize); if (error) return (error); fragsz = 0; } } return (0); } int dumpsys_buf_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); - error = dump_append(di, di->blockbuf, 0, di->blocksize); + error = dump_append(di, di->blockbuf, di->blocksize); fragsz = 0; return (error); } CTASSERT(PAGE_SHIFT < 20); #define PG2MB(pgs) ((pgs + (1 << (20 - PAGE_SHIFT)) - 1) >> (20 - PAGE_SHIFT)) int dumpsys_cb_dumpdata(struct dump_pa *mdp, int seqnr, void *arg) { struct dumperinfo *di = (struct dumperinfo*)arg; vm_paddr_t pa; void *va; uint64_t pgs; size_t counter, sz, chunk; int c, error; u_int maxdumppgs; error = 0; /* catch case in which chunk size is 0 */ counter = 0; /* Update twiddle every 16MB */ va = NULL; pgs = mdp->pa_size / PAGE_SIZE; pa = mdp->pa_start; maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS); if (maxdumppgs == 0) /* seatbelt */ maxdumppgs = 1; printf(" chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs), (uintmax_t)pgs); dumpsys_wbinv_all(); while (pgs) { chunk = pgs; if (chunk > maxdumppgs) chunk = maxdumppgs; sz = chunk << PAGE_SHIFT; counter += sz; if (counter >> 24) { printf(" %ju", (uintmax_t)PG2MB(pgs)); counter &= (1 << 24) - 1; } dumpsys_map_chunk(pa, chunk, &va); wdog_kern_pat(WD_LASTVAL); - error = dump_append(di, va, 0, sz); + error = dump_append(di, va, sz); dumpsys_unmap_chunk(pa, chunk, va); if (error) break; pgs -= chunk; pa += sz; /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } printf(" ... %s\n", (error) ? 
"fail" : "ok"); return (error); } int dumpsys_foreach_chunk(dumpsys_callback_t cb, void *arg) { struct dump_pa *mdp; int error, seqnr; seqnr = 0; mdp = dumpsys_pa_next(NULL); while (mdp != NULL) { error = (*cb)(mdp, seqnr++, arg); if (error) return (-error); mdp = dumpsys_pa_next(mdp); } return (seqnr); } static off_t fileofs; static int cb_dumphdr(struct dump_pa *mdp, int seqnr, void *arg) { struct dumperinfo *di = (struct dumperinfo*)arg; Elf_Phdr phdr; uint64_t size; int error; size = mdp->pa_size; bzero(&phdr, sizeof(phdr)); phdr.p_type = PT_LOAD; phdr.p_flags = PF_R; /* XXX */ phdr.p_offset = fileofs; #ifdef __powerpc__ phdr.p_vaddr = (do_minidump? mdp->pa_start : ~0L); phdr.p_paddr = (do_minidump? ~0L : mdp->pa_start); #else phdr.p_vaddr = mdp->pa_start; phdr.p_paddr = mdp->pa_start; #endif phdr.p_filesz = size; phdr.p_memsz = size; phdr.p_align = PAGE_SIZE; error = dumpsys_buf_write(di, (char*)&phdr, sizeof(phdr)); fileofs += phdr.p_filesz; return (error); } static int cb_size(struct dump_pa *mdp, int seqnr, void *arg) { uint64_t *sz; sz = (uint64_t *)arg; *sz += (uint64_t)mdp->pa_size; return (0); } int dumpsys_generic(struct dumperinfo *di) { static struct kerneldumpheader kdh; Elf_Ehdr ehdr; uint64_t dumpsize; off_t hdrgap; size_t hdrsz; int error; #if MINIDUMP_PAGE_TRACKING == 1 if (do_minidump) return (minidumpsys(di, false)); #endif bzero(&ehdr, sizeof(ehdr)); ehdr.e_ident[EI_MAG0] = ELFMAG0; ehdr.e_ident[EI_MAG1] = ELFMAG1; ehdr.e_ident[EI_MAG2] = ELFMAG2; ehdr.e_ident[EI_MAG3] = ELFMAG3; ehdr.e_ident[EI_CLASS] = ELF_CLASS; #if BYTE_ORDER == LITTLE_ENDIAN ehdr.e_ident[EI_DATA] = ELFDATA2LSB; #else ehdr.e_ident[EI_DATA] = ELFDATA2MSB; #endif ehdr.e_ident[EI_VERSION] = EV_CURRENT; ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE; /* XXX big picture? */ ehdr.e_type = ET_CORE; ehdr.e_machine = EM_VALUE; ehdr.e_phoff = sizeof(ehdr); ehdr.e_flags = 0; ehdr.e_ehsize = sizeof(ehdr); ehdr.e_phentsize = sizeof(Elf_Phdr); ehdr.e_shentsize = sizeof(Elf_Shdr); dumpsys_pa_init(); /* Calculate dump size. */ dumpsize = 0L; ehdr.e_phnum = dumpsys_foreach_chunk(cb_size, &dumpsize) + DUMPSYS_NUM_AUX_HDRS; hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize; fileofs = MD_ALIGN(hdrsz); dumpsize += fileofs; hdrgap = fileofs - roundup2((off_t)hdrsz, di->blocksize); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_ARCH_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %ju MB (%d chunks)\n", (uintmax_t)dumpsize >> 20, ehdr.e_phnum - DUMPSYS_NUM_AUX_HDRS); /* Dump ELF header */ error = dumpsys_buf_write(di, (char*)&ehdr, sizeof(ehdr)); if (error) goto fail; /* Dump program headers */ error = dumpsys_foreach_chunk(cb_dumphdr, di); if (error < 0) goto fail; error = dumpsys_write_aux_headers(di); if (error < 0) goto fail; dumpsys_buf_flush(di); /* * All headers are written using blocked I/O, so we know the * current offset is (still) block aligned. Skip the alignement * in the file to have the segment contents aligned at page * boundary. */ error = dumpsys_buf_seek(di, (size_t)hdrgap); if (error) goto fail; /* Dump memory chunks. */ error = dumpsys_foreach_chunk(dumpsys_cb_dumpdata, di); if (error < 0) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == E2BIG || error == ENOSPC) printf("\nDump failed. 
Partition too small.\n"); else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } #if MINIDUMP_PAGE_TRACKING == 1 /* Minidump progress bar */ static struct { const int min_per; const int max_per; bool visited; } progress_track[10] = { { 0, 10, false}, { 10, 20, false}, { 20, 30, false}, { 30, 40, false}, { 40, 50, false}, { 50, 60, false}, { 60, 70, false}, { 70, 80, false}, { 80, 90, false}, { 90, 100, false} }; static uint64_t dumpsys_pb_size; static uint64_t dumpsys_pb_remaining; static uint64_t dumpsys_pb_check; /* Reset the progress bar for a dump of dumpsize. */ void dumpsys_pb_init(uint64_t dumpsize) { int i; dumpsys_pb_size = dumpsys_pb_remaining = dumpsize; dumpsys_pb_check = 0; for (i = 0; i < nitems(progress_track); i++) progress_track[i].visited = false; } /* * Update the progress according to the delta bytes that were written out. * Check and print the progress percentage. */ void dumpsys_pb_progress(size_t delta) { int sofar, i; dumpsys_pb_remaining -= delta; dumpsys_pb_check += delta; /* * To save time while dumping, only loop through progress_track * occasionally. */ if ((dumpsys_pb_check >> DUMPSYS_PB_CHECK_BITS) == 0) return; else dumpsys_pb_check &= (1 << DUMPSYS_PB_CHECK_BITS) - 1; sofar = 100 - ((dumpsys_pb_remaining * 100) / dumpsys_pb_size); for (i = 0; i < nitems(progress_track); i++) { if (sofar < progress_track[i].min_per || sofar > progress_track[i].max_per) continue; if (!progress_track[i].visited) { progress_track[i].visited = true; printf("..%d%%", sofar); } break; } } int minidumpsys(struct dumperinfo *di, bool livedump) { struct minidumpstate state; struct msgbuf mb_copy; char *msg_ptr; size_t sz; int error; if (livedump) { KASSERT(!dumping, ("live dump invoked from incorrect context")); /* * Before invoking cpu_minidumpsys() on the live system, we * must snapshot some required global state: the message * buffer, and the page dump bitset. They may be modified at * any moment, so for the sake of the live dump it is best to * have an unchanging snapshot to work with. Both are included * as part of the dump and consumed by userspace tools. * * Other global state important to the minidump code is the * dump_avail array and the kernel's page tables, but snapshots * are not taken of these. For one, dump_avail[] is expected * not to change after boot. Snapshotting the kernel page * tables would involve an additional walk, so this is avoided * too. * * This means live dumps are best effort, and the result may or * may not be usable; there are no guarantees about the * consistency of the dump's contents. Any of the following * (and likely more) may affect the live dump: * * - Data may be modified, freed, or remapped during the * course of the dump, such that the contents written out * are partially or entirely unrecognizable. This means * valid references may point to destroyed/mangled objects, * and vice versa. * * - The dumped context of any threads that ran during the * dump process may be unreliable. * * - The set of kernel page tables included in the dump likely * won't correspond exactly to the copy of the dump bitset. * This means some pages will be dumped without any way to * locate them, and some pages may not have been dumped * despite appearing as if they should. 
*/ msg_ptr = malloc(msgbufsize, M_TEMP, M_WAITOK); msgbuf_duplicate(msgbufp, &mb_copy, msg_ptr); state.msgbufp = &mb_copy; sz = BITSET_SIZE(vm_page_dump_pages); state.dump_bitset = malloc(sz, M_TEMP, M_WAITOK); BIT_COPY_STORE_REL(sz, vm_page_dump, state.dump_bitset); } else { KASSERT(dumping, ("minidump invoked outside of doadump()")); /* Use the globals. */ state.msgbufp = msgbufp; state.dump_bitset = vm_page_dump; } error = cpu_minidumpsys(di, &state); if (livedump) { free(msg_ptr, M_TEMP); free(state.dump_bitset, M_TEMP); } return (error); } #endif /* MINIDUMP_PAGE_TRACKING == 1 */ diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index b5433e4ed55f..c78b19ca454a 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -1,1843 +1,1838 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ekcd.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_printf.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer"); #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); static bool debugger_on_recursive_panic = false; SYSCTL_BOOL(_debug, OID_AUTO, debugger_on_recursive_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_recursive_panic, 0, "Run debugger on recursive kernel panic"); int debugger_on_trap = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_trap, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_trap, 0, "Run debugger on kernel trap before panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; static bool trace_all_panics = true; #else static int trace_on_panic = 0; static bool trace_all_panics = false; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); SYSCTL_BOOL(_debug, OID_AUTO, trace_all_panics, CTLFLAG_RWTUN, &trace_all_panics, 0, "Print stack traces on secondary kernel panics"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static bool poweroff_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, poweroff_on_panic, CTLFLAG_RWTUN, &poweroff_on_panic, 0, "Do a power off instead of a reboot on a panic"); static bool powercycle_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, powercycle_on_panic, CTLFLAG_RWTUN, &powercycle_on_panic, 0, "Do a power cycle instead of a reboot on a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, "Show busy buffers during shutdown"); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); #ifdef EKCD FEATURE(ekcd, "Encrypted kernel crash dumps support"); MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data"); struct kerneldumpcrypto { uint8_t kdc_encryption; uint8_t kdc_iv[KERNELDUMP_IV_MAX_SIZE]; union { struct { keyInstance aes_ki; cipherInstance aes_ci; } u_aes; struct chacha_ctx u_chacha; } u; #define kdc_ki 
u.u_aes.aes_ki #define kdc_ci u.u_aes.aes_ci #define kdc_chacha u.u_chacha uint32_t kdc_dumpkeysize; struct kerneldumpkey kdc_dumpkey[]; }; #endif struct kerneldumpcomp { uint8_t kdc_format; struct compressor *kdc_stream; uint8_t *kdc_buf; size_t kdc_resid; }; static struct kerneldumpcomp *kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression); static void kerneldumpcomp_destroy(struct dumperinfo *di); static int kerneldumpcomp_write_cb(void *base, size_t len, off_t off, void *arg); static int kerneldump_gzlevel = 6; SYSCTL_INT(_kern, OID_AUTO, kerneldump_gzlevel, CTLFLAG_RWTUN, &kerneldump_gzlevel, 0, "Kernel crash dump compression level"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; bool __read_frequently panicked; int __read_mostly dumping; /* system is dumping */ int rebooting; /* system is rebooting */ /* * Used to serialize between sysctl kern.shutdown.dumpdevname and list * modifications via ioctl. */ static struct mtx dumpconf_list_lk; MTX_SYSINIT(dumper_configs, &dumpconf_list_lk, "dumper config list", MTX_DEF); /* Our selected dumper(s). */ static TAILQ_HEAD(dumpconflist, dumperinfo) dumper_configs = TAILQ_HEAD_INITIALIZER(dumper_configs); /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) error = kern_reroot(); else kern_reboot(uap->opt); } return (error); } static void shutdown_nice_task_fn(void *arg, int pending __unused) { int howto; howto = (uintptr_t)arg; /* Send a signal to init(8) and have it shutdown the world. 
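The signal encodes the request: SIGUSR2 asks init(8) to power off, SIGWINCH to power cycle, SIGUSR1 to halt and SIGINT to perform a plain reboot.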
*/ PROC_LOCK(initproc); if ((howto & RB_POWEROFF) != 0) { BOOTTRACE("SIGUSR2 to init(8)"); kern_psignal(initproc, SIGUSR2); } else if ((howto & RB_POWERCYCLE) != 0) { BOOTTRACE("SIGWINCH to init(8)"); kern_psignal(initproc, SIGWINCH); } else if ((howto & RB_HALT) != 0) { BOOTTRACE("SIGUSR1 to init(8)"); kern_psignal(initproc, SIGUSR1); } else { BOOTTRACE("SIGINT to init(8)"); kern_psignal(initproc, SIGINT); } PROC_UNLOCK(initproc); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, &shutdown_nice_task_fn, NULL); /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL && !SCHEDULER_STOPPED()) { BOOTTRACE("shutdown initiated"); shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { /* * No init(8) running, or scheduler would not allow it * to run, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } /* * Set up a context that can be extracted from the dump. */ void dump_savectx(void) { savectx(&dumppcb); dumptid = curthread->td_tid; } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (TAILQ_EMPTY(&dumper_configs)) return (ENXIO); dump_savectx(); dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(TAILQ_FIRST(&dumper_configs)); } #endif if (coredump) { struct dumperinfo *di; TAILQ_FOREACH(di, &dumper_configs, di_next) { error = dumpsys(di); if (error == 0) break; } } dumping--; return (error); } /* * Trace the shutdown reason. */ static void reboottrace(int howto) { if ((howto & RB_DUMP) != 0) { if ((howto & RB_HALT) != 0) BOOTTRACE("system panic: halting..."); if ((howto & RB_POWEROFF) != 0) BOOTTRACE("system panic: powering off..."); if ((howto & (RB_HALT|RB_POWEROFF)) == 0) BOOTTRACE("system panic: rebooting..."); } else { if ((howto & RB_HALT) != 0) BOOTTRACE("system halting..."); if ((howto & RB_POWEROFF) != 0) BOOTTRACE("system powering off..."); if ((howto & (RB_HALT|RB_POWEROFF)) == 0) BOOTTRACE("system rebooting..."); } } /* * kern_reboot(9): Shut down the system cleanly to prepare for reboot, halt, or * power off. */ void kern_reboot(int howto) { static int once = 0; if (initproc != NULL && curproc != initproc) BOOTTRACE("kernel shutdown (dirty) started"); else BOOTTRACE("kernel shutdown (clean) started"); /* * Normal paths here don't hold Giant, but we can wind up here * unexpectedly with it held. Drop it now so we don't have to * drop and pick it up elsewhere. The paths it is locking will * never be returned to, and it is preferable to preclude * deadlock than to lock against code that won't ever * continue. */ while (mtx_owned(&Giant)) mtx_unlock(&Giant); #if defined(SMP) /* * Bind us to the first CPU so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. 
*/ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, CPU_FIRST()); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("%s: not running on cpu 0", __func__)); } #endif /* We're in the process of rebooting. */ rebooting = 1; reboottrace(howto); /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); BOOTTRACE("shutdown pre sync complete"); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; BOOTTRACE("bufshutdown begin"); bufshutdown(show_busybufs); BOOTTRACE("bufshutdown end"); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); BOOTTRACE("shutdown post sync complete"); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ BOOTTRACE("shutdown final begin"); if (shutdown_trace) boottrace_dump_console(); EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp); return (ENOENT); } if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). * * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. */ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); wdog_kern_pat(WD_TO_NEVER); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: break; } } } /* * Check to see if the system panicked, pause and then reboot * according to the specified delay. 
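* A panic_reboot_wait_time of zero reboots immediately, while -1 waits indefinitely for a console keypress.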
*/ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. */ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANT_SUPPORT) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_suppress_in_panic = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "kassert options"); #ifdef KASSERT_PANIC_OPTIONAL #define KASSERT_RWTUN CTLFLAG_RWTUN #else #define KASSERT_RWTUN CTLFLAG_RDTUN #endif SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, KASSERT_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (0) or just a warning (1)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, KASSERT_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, KASSERT_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, KASSERT_RWTUN, &kassert_do_log, 0, "If warn_only is enabled, log (1) or do not log (0) assertion violations"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RD | CTLFLAG_STATS, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, KASSERT_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, KASSERT_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, KASSERT_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, KASSERT_RWTUN, &kassert_suppress_in_panic, 0, "KASSERTs will be suppressed while handling a panic"); #undef KASSERT_RWTUN static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int 
kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } #ifdef KASSERT_PANIC_OPTIONAL /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * If we are suppressing secondary panics, log the warning but do not * re-enter panic/kdb. */ if (panicstr != NULL && kassert_suppress_in_panic) { if (kassert_do_log) { printf("KASSERT failed: %s\n", buf); #ifdef KDB if (trace_all_panics && trace_on_panic) kdb_backtrace(); #endif } return; } /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* KASSERT_PANIC_OPTIONAL */ #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } #endif /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; bootopt = RB_AUTOBOOT; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; panicked = true; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif printf("time = %jd\n", (intmax_t )time_second); #ifdef KDB if ((newpanic || trace_all_panics) && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); else if (!newpanic && debugger_on_recursive_panic) kdb_enter(KDB_WHY_PANIC, "re-panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; if (poweroff_on_panic) bootopt |= RB_POWEROFF; if (powercycle_on_panic) bootopt |= RB_POWERCYCLE; kern_reboot(bootopt); } /* * Support for poweroff delay. 
* * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if ((howto & (RB_POWEROFF | RB_POWERCYCLE)) == 0 || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (panicstr) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop... ", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (panicstr) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop... ", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static int dumpdevname_sysctl_handler(SYSCTL_HANDLER_ARGS) { char buf[256]; struct dumperinfo *di; struct sbuf sb; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req); mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH(di, &dumper_configs, di_next) { if (di != TAILQ_FIRST(&dumper_configs)) sbuf_putc(&sb, ','); sbuf_cat(&sb, di->di_devname); } mtx_unlock(&dumpconf_list_lk); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_shutdown, OID_AUTO, dumpdevname, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, &dumper_configs, 0, dumpdevname_sysctl_handler, "A", "Device(s) for kernel dumps"); -static int _dump_append(struct dumperinfo *di, void *virtual, - vm_offset_t physical, size_t length); +static int _dump_append(struct dumperinfo *di, void *virtual, size_t length); #ifdef EKCD static struct kerneldumpcrypto * kerneldumpcrypto_create(size_t blocksize, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { struct kerneldumpcrypto *kdc; struct kerneldumpkey *kdk; uint32_t dumpkeysize; dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize); kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO); arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0); kdc->kdc_encryption = encryption; switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0) goto failed; break; case KERNELDUMP_ENC_CHACHA20: chacha_keysetup(&kdc->kdc_chacha, key, 256); break; default: goto failed; } kdc->kdc_dumpkeysize = dumpkeysize; kdk = kdc->kdc_dumpkey; kdk->kdk_encryption = kdc->kdc_encryption; 
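/* The dump key record carries the cipher, the initial IV and the already-encrypted symmetric key; it is written out ahead of the dump data so that even a partial dump can be decrypted. */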
memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); kdk->kdk_encryptedkeysize = htod32(encryptedkeysize); memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize); return (kdc); failed: zfree(kdc, M_EKCD); return (NULL); } static int kerneldumpcrypto_init(struct kerneldumpcrypto *kdc) { uint8_t hash[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; struct kerneldumpkey *kdk; int error; error = 0; if (kdc == NULL) return (0); /* * When a user enters ddb it can write a crash dump multiple times. * Each time it should be encrypted using a different IV. */ SHA256_Init(&ctx); SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv)); SHA256_Final(hash, &ctx); bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv)); switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, kdc->kdc_iv) <= 0) { error = EINVAL; goto out; } break; case KERNELDUMP_ENC_CHACHA20: chacha_ivsetup(&kdc->kdc_chacha, kdc->kdc_iv, NULL); break; default: error = EINVAL; goto out; } kdk = kdc->kdc_dumpkey; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); out: explicit_bzero(hash, sizeof(hash)); return (error); } static uint32_t kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc) { if (kdc == NULL) return (0); return (kdc->kdc_dumpkeysize); } #endif /* EKCD */ static struct kerneldumpcomp * kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression) { struct kerneldumpcomp *kdcomp; int format; switch (compression) { case KERNELDUMP_COMP_GZIP: format = COMPRESS_GZIP; break; case KERNELDUMP_COMP_ZSTD: format = COMPRESS_ZSTD; break; default: return (NULL); } kdcomp = malloc(sizeof(*kdcomp), M_DUMPER, M_WAITOK | M_ZERO); kdcomp->kdc_format = compression; kdcomp->kdc_stream = compressor_init(kerneldumpcomp_write_cb, format, di->maxiosize, kerneldump_gzlevel, di); if (kdcomp->kdc_stream == NULL) { free(kdcomp, M_DUMPER); return (NULL); } kdcomp->kdc_buf = malloc(di->maxiosize, M_DUMPER, M_WAITOK | M_NODUMP); return (kdcomp); } static void kerneldumpcomp_destroy(struct dumperinfo *di) { struct kerneldumpcomp *kdcomp; kdcomp = di->kdcomp; if (kdcomp == NULL) return; compressor_fini(kdcomp->kdc_stream); zfree(kdcomp->kdc_buf, M_DUMPER); free(kdcomp, M_DUMPER); } /* * Free a dumper. Must not be present on global list. */ void dumper_destroy(struct dumperinfo *di) { if (di == NULL) return; zfree(di->blockbuf, M_DUMPER); kerneldumpcomp_destroy(di); #ifdef EKCD zfree(di->kdcrypto, M_EKCD); #endif zfree(di, M_DUMPER); } /* * Allocate and set up a new dumper from the provided template. 
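* On success the new dumper is returned through *dip; on failure everything allocated so far is released again via dumper_destroy().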
*/ int dumper_create(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda, struct dumperinfo **dip) { struct dumperinfo *newdi; int error = 0; if (dip == NULL) return (EINVAL); /* Allocate a new dumper */ newdi = malloc(sizeof(*newdi) + strlen(devname) + 1, M_DUMPER, M_WAITOK | M_ZERO); memcpy(newdi, di_template, sizeof(*newdi)); newdi->blockbuf = NULL; newdi->kdcrypto = NULL; newdi->kdcomp = NULL; strcpy(newdi->di_devname, devname); if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { #ifdef EKCD newdi->kdcrypto = kerneldumpcrypto_create(newdi->blocksize, kda->kda_encryption, kda->kda_key, kda->kda_encryptedkeysize, kda->kda_encryptedkey); if (newdi->kdcrypto == NULL) { error = EINVAL; goto cleanup; } #else error = EOPNOTSUPP; goto cleanup; #endif } if (kda->kda_compression != KERNELDUMP_COMP_NONE) { #ifdef EKCD /* * We can't support simultaneous unpadded block cipher * encryption and compression because there is no guarantee the * length of the compressed result is exactly a multiple of the * cipher block size. */ if (kda->kda_encryption == KERNELDUMP_ENC_AES_256_CBC) { error = EOPNOTSUPP; goto cleanup; } #endif newdi->kdcomp = kerneldumpcomp_create(newdi, kda->kda_compression); if (newdi->kdcomp == NULL) { error = EINVAL; goto cleanup; } } newdi->blockbuf = malloc(newdi->blocksize, M_DUMPER, M_WAITOK | M_ZERO); *dip = newdi; return (0); cleanup: dumper_destroy(newdi); return (error); } /* * Create a new dumper and register it in the global list. */ int dumper_insert(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *newdi, *listdi; bool inserted; uint8_t index; int error; index = kda->kda_index; MPASS(index != KDA_REMOVE && index != KDA_REMOVE_DEV && index != KDA_REMOVE_ALL); error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); error = dumper_create(di_template, devname, kda, &newdi); if (error != 0) return (error); /* Add the new configuration to the queue */ mtx_lock(&dumpconf_list_lk); inserted = false; TAILQ_FOREACH(listdi, &dumper_configs, di_next) { if (index == 0) { TAILQ_INSERT_BEFORE(listdi, newdi, di_next); inserted = true; break; } index--; } if (!inserted) TAILQ_INSERT_TAIL(&dumper_configs, newdi, di_next); mtx_unlock(&dumpconf_list_lk); return (0); } #ifdef DDB void dumper_ddb_insert(struct dumperinfo *newdi) { TAILQ_INSERT_HEAD(&dumper_configs, newdi, di_next); } void dumper_ddb_remove(struct dumperinfo *di) { TAILQ_REMOVE(&dumper_configs, di, di_next); } #endif static bool dumper_config_match(const struct dumperinfo *di, const char *devname, const struct diocskerneldump_arg *kda) { if (kda->kda_index == KDA_REMOVE_ALL) return (true); if (strcmp(di->di_devname, devname) != 0) return (false); /* * Allow wildcard removal of configs matching a device on g_dev_orphan. */ if (kda->kda_index == KDA_REMOVE_DEV) return (true); if (di->kdcomp != NULL) { if (di->kdcomp->kdc_format != kda->kda_compression) return (false); } else if (kda->kda_compression != KERNELDUMP_COMP_NONE) return (false); #ifdef EKCD if (di->kdcrypto != NULL) { if (di->kdcrypto->kdc_encryption != kda->kda_encryption) return (false); /* * Do we care to verify keys match to delete? It seems weird * to expect multiple fallback dump configurations on the same * device that only differ in crypto key. */ } else #endif if (kda->kda_encryption != KERNELDUMP_ENC_NONE) return (false); return (true); } /* * Remove and free the requested dumper(s) from the global list. 
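* Wildcard indices (KDA_REMOVE_DEV, KDA_REMOVE_ALL) may match and remove more than one configuration.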
*/ int dumper_remove(const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *di, *sdi; bool found; int error; error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); /* * Try to find a matching configuration, and kill it. * * NULL 'kda' indicates remove any configuration matching 'devname', * which may remove multiple configurations in atypical configurations. */ found = false; mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH_SAFE(di, &dumper_configs, di_next, sdi) { if (dumper_config_match(di, devname, kda)) { found = true; TAILQ_REMOVE(&dumper_configs, di, di_next); dumper_destroy(di); } } mtx_unlock(&dumpconf_list_lk); /* Only produce ENOENT if a more targeted match didn't match. */ if (!found && kda->kda_index == KDA_REMOVE) return (ENOENT); return (0); } static int dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length) { if (di->mediasize > 0 && length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { if (di->kdcomp != NULL && offset >= di->mediaoffset) { printf( "Compressed dump failed to fit in device boundaries.\n"); return (E2BIG); } printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } if (length % di->blocksize != 0) { printf("Attempt to write partial block of length %ju.\n", (uintmax_t)length); return (EINVAL); } if (offset % di->blocksize != 0) { printf("Attempt to write at unaligned offset %jd.\n", (intmax_t)offset); return (EINVAL); } return (0); } #ifdef EKCD static int dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size) { switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf, 8 * size, buf) <= 0) { return (EIO); } if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, buf + size - 16 /* IV size for AES-256-CBC */) <= 0) { return (EIO); } break; case KERNELDUMP_ENC_CHACHA20: chacha_encrypt_bytes(&kdc->kdc_chacha, buf, buf, size); break; default: return (EINVAL); } return (0); } /* Encrypt data and call dumper. */ static int -dump_encrypted_write(struct dumperinfo *di, void *virtual, - vm_offset_t physical, off_t offset, size_t length) +dump_encrypted_write(struct dumperinfo *di, void *virtual, off_t offset, + size_t length) { static uint8_t buf[KERNELDUMP_BUFFER_SIZE]; struct kerneldumpcrypto *kdc; int error; size_t nbytes; kdc = di->kdcrypto; while (length > 0) { nbytes = MIN(length, sizeof(buf)); bcopy(virtual, buf, nbytes); if (dump_encrypt(kdc, buf, nbytes) != 0) return (EIO); - error = dump_write(di, buf, physical, offset, nbytes); + error = dump_write(di, buf, offset, nbytes); if (error != 0) return (error); offset += nbytes; virtual = (void *)((uint8_t *)virtual + nbytes); length -= nbytes; } return (0); } #endif /* EKCD */ static int kerneldumpcomp_write_cb(void *base, size_t length, off_t offset, void *arg) { struct dumperinfo *di; size_t resid, rlength; int error; di = arg; if (length % di->blocksize != 0) { /* * This must be the final write after flushing the compression * stream. Write as many full blocks as possible and stash the * residual data in the dumper's block buffer. It will be * padded and written in dump_finish(). 
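* Returning EAGAIN tells the caller that the tail of the stream has been deferred to dump_finish() rather than lost.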
*/ rlength = rounddown(length, di->blocksize); if (rlength != 0) { - error = _dump_append(di, base, 0, rlength); + error = _dump_append(di, base, rlength); if (error != 0) return (error); } resid = length - rlength; memmove(di->blockbuf, (uint8_t *)base + rlength, resid); bzero((uint8_t *)di->blockbuf + resid, di->blocksize - resid); di->kdcomp->kdc_resid = resid; return (EAGAIN); } - return (_dump_append(di, base, 0, length)); + return (_dump_append(di, base, length)); } /* * Write kernel dump headers at the beginning and end of the dump extent. * Write the kernel dump encryption key after the leading header if we were * configured to do so. */ static int dump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *buf; size_t hdrsz; uint64_t extent; uint32_t keysize; int error; hdrsz = sizeof(*kdh); if (hdrsz > di->blocksize) return (ENOMEM); #ifdef EKCD kdc = di->kdcrypto; keysize = kerneldumpcrypto_dumpkeysize(kdc); #else keysize = 0; #endif /* * If the dump device has special handling for headers, let it take care * of writing them out. */ if (di->dumper_hdr != NULL) return (di->dumper_hdr(di, kdh)); if (hdrsz == di->blocksize) buf = kdh; else { buf = di->blockbuf; memset(buf, 0, di->blocksize); memcpy(buf, kdh, hdrsz); } extent = dtoh64(kdh->dumpextent); #ifdef EKCD if (kdc != NULL) { - error = dump_write(di, kdc->kdc_dumpkey, 0, + error = dump_write(di, kdc->kdc_dumpkey, di->mediaoffset + di->mediasize - di->blocksize - extent - keysize, keysize); if (error != 0) return (error); } #endif - error = dump_write(di, buf, 0, + error = dump_write(di, buf, di->mediaoffset + di->mediasize - 2 * di->blocksize - extent - keysize, di->blocksize); if (error == 0) - error = dump_write(di, buf, 0, di->mediaoffset + di->mediasize - + error = dump_write(di, buf, di->mediaoffset + di->mediasize - di->blocksize, di->blocksize); return (error); } /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This is to * protect us from metadata and metadata from us. */ #define SIZEOF_METADATA (64 * 1024) /* * Do some preliminary setup for a kernel dump: initialize state for encryption, * if requested, and make sure that we have enough space on the dump device. * * We set things up so that the dump ends before the last sector of the dump * device, at which the trailing header is written. * * +-----------+------+-----+----------------------------+------+ * | | lhdr | key | ... kernel dump ... | thdr | * +-----------+------+-----+----------------------------+------+ * 1 blk opt <------- dump extent --------> 1 blk * * Dumps written using dump_append() start at the beginning of the extent. * Uncompressed dumps will use the entire extent, but compressed dumps typically * will not. The true length of the dump is recorded in the leading and trailing * headers once the dump has been completed. * * The dump device may provide a callback, in which case it will initialize * dumpoff and take care of laying out the headers. */ int dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *key; uint64_t dumpextent, span; uint32_t keysize; int error; #ifdef EKCD /* Send the key before the dump so a partial dump is still usable. */ kdc = di->kdcrypto; error = kerneldumpcrypto_init(kdc); if (error != 0) return (error); keysize = kerneldumpcrypto_dumpkeysize(kdc); key = keysize > 0 ? 
kdc->kdc_dumpkey : NULL; #else error = 0; keysize = 0; key = NULL; #endif if (di->dumper_start != NULL) { error = di->dumper_start(di, key, keysize); } else { dumpextent = dtoh64(kdh->dumpextent); span = SIZEOF_METADATA + dumpextent + 2 * di->blocksize + keysize; if (di->mediasize < span) { if (di->kdcomp == NULL) return (E2BIG); /* * We don't yet know how much space the compressed dump * will occupy, so try to use the whole swap partition * (minus the first 64KB) in the hope that the * compressed dump will fit. If that doesn't turn out to * be enough, the bounds checking in dump_write() * will catch us and cause the dump to fail. */ dumpextent = di->mediasize - span + dumpextent; kdh->dumpextent = htod64(dumpextent); } /* * The offset at which to begin writing the dump. */ di->dumpoff = di->mediaoffset + di->mediasize - di->blocksize - dumpextent; } di->origdumpoff = di->dumpoff; return (error); } static int -_dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, - size_t length) +_dump_append(struct dumperinfo *di, void *virtual, size_t length) { int error; #ifdef EKCD if (di->kdcrypto != NULL) - error = dump_encrypted_write(di, virtual, physical, di->dumpoff, - length); + error = dump_encrypted_write(di, virtual, di->dumpoff, length); else #endif - error = dump_write(di, virtual, physical, di->dumpoff, length); + error = dump_write(di, virtual, di->dumpoff, length); if (error == 0) di->dumpoff += length; return (error); } /* * Write to the dump device starting at dumpoff. When compression is enabled, * writes to the device will be performed using a callback that gets invoked * when the compression stream's output buffer is full. */ int -dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, - size_t length) +dump_append(struct dumperinfo *di, void *virtual, size_t length) { void *buf; if (di->kdcomp != NULL) { /* Bounce through a buffer to avoid CRC errors. */ if (length > di->maxiosize) return (EINVAL); buf = di->kdcomp->kdc_buf; memmove(buf, virtual, length); return (compressor_write(di->kdcomp->kdc_stream, buf, length)); } - return (_dump_append(di, virtual, physical, length)); + return (_dump_append(di, virtual, length)); } /* * Write to the dump device at the specified offset. */ int -dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, - off_t offset, size_t length) +dump_write(struct dumperinfo *di, void *virtual, off_t offset, size_t length) { int error; error = dump_check_bounds(di, offset, length); if (error != 0) return (error); return (di->dumper(di->priv, virtual, offset, length)); } /* * Perform kernel dump finalization: flush the compression stream, if necessary, * write the leading and trailing kernel dump headers now that we know the true * length of the dump, and optionally write the encryption key following the * leading header. */ int dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh) { int error; if (di->kdcomp != NULL) { error = compressor_flush(di->kdcomp->kdc_stream); if (error == EAGAIN) { /* We have residual data in di->blockbuf. */ - error = _dump_append(di, di->blockbuf, 0, di->blocksize); + error = _dump_append(di, di->blockbuf, di->blocksize); if (error == 0) /* Compensate for _dump_append()'s adjustment. */ di->dumpoff -= di->blocksize - di->kdcomp->kdc_resid; di->kdcomp->kdc_resid = 0; } if (error != 0) return (error); /* * We now know the size of the compressed dump, so update the * header accordingly and recompute parity. 
*/ kdh->dumplength = htod64(di->dumpoff - di->origdumpoff); kdh->parity = 0; kdh->parity = kerneldump_parity(kdh); compressor_reset(di->kdcomp->kdc_stream); } error = dump_write_headers(di, kdh); if (error != 0) return (error); - (void)dump_write(di, NULL, 0, 0, 0); + (void)dump_write(di, NULL, 0, 0); return (0); } void dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh, const char *magic, uint32_t archver, uint64_t dumplen) { size_t dstsize; bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumpextent = kdh->dumplength; kdh->dumptime = htod64(time_second); #ifdef EKCD kdh->dumpkeysize = htod32(kerneldumpcrypto_dumpkeysize(di->kdcrypto)); #else kdh->dumpkeysize = 0; #endif kdh->blocksize = htod32(di->blocksize); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); dstsize = sizeof(kdh->versionstring); if (strlcpy(kdh->versionstring, version, dstsize) >= dstsize) kdh->versionstring[dstsize - 2] = '\n'; if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); if (di->kdcomp != NULL) kdh->compression = di->kdcomp->kdc_format; kdh->parity = kerneldump_parity(kdh); } #ifdef DDB DB_SHOW_COMMAND(panic, db_show_panic) { if (panicstr == NULL) db_printf("panicstr not set\n"); else db_printf("panic: %s\n", panicstr); } #endif diff --git a/sys/powerpc/powerpc/minidump_machdep.c b/sys/powerpc/powerpc/minidump_machdep.c index 396c6487823e..d713e11f2ab2 100644 --- a/sys/powerpc/powerpc/minidump_machdep.c +++ b/sys/powerpc/powerpc/minidump_machdep.c @@ -1,339 +1,339 @@ /*- * Copyright (c) 2019 Leandro Lupori * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Debugging stuff */ #define MINIDUMP_DEBUG 0 #if MINIDUMP_DEBUG #define dprintf(fmt, ...) printf(fmt, ## __VA_ARGS__) #define DBG(...) __VA_ARGS__ static size_t total, dumptotal; static void dump_total(const char *id, size_t sz); #else #define dprintf(fmt, ...) #define DBG(...) #define dump_total(...) 
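/* With MINIDUMP_DEBUG disabled the tracing helpers expand to nothing. */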
#endif extern vm_offset_t __startkernel, __endkernel; static int dump_retry_count = 5; SYSCTL_INT(_machdep, OID_AUTO, dump_retry_count, CTLFLAG_RWTUN, &dump_retry_count, 0, "Number of times dump has to retry before bailing out"); static struct kerneldumpheader kdh; static char pgbuf[PAGE_SIZE]; static size_t dumpsize; /* Handle chunked writes. */ static size_t fragsz; static void pmap_kenter_temporary(vm_offset_t va, vm_paddr_t pa) { pmap_kremove(va); pmap_kenter(va, pa); } static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); - error = dump_append(di, crashdumpmap, 0, fragsz); + error = dump_append(di, crashdumpmap, fragsz); DBG(dumptotal += fragsz;) fragsz = 0; return (error); } static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len, maxdumpsz; int error, i, c; maxdumpsz = MIN(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("Size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("Can't have both va and pa!\n"); return (EINVAL); } if ((pa % PAGE_SIZE) != 0) { printf("Address not page aligned 0x%lx\n", pa); return (EINVAL); } if (ptr != NULL) { /* * If we're doing a virtual dump, flush any pre-existing * pa pages */ error = blk_flush(di); if (error) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; dumpsys_pb_progress(len); if (ptr) { - error = dump_append(di, ptr, 0, len); + error = dump_append(di, ptr, len); if (error) return (error); DBG(dumptotal += len;) ptr += len; } else { for (i = 0; i < len; i += PAGE_SIZE) pmap_kenter_temporary( (vm_offset_t)crashdumpmap + fragsz + i, pa + i); fragsz += len; pa += len; if (fragsz == maxdumpsz) { error = blk_flush(di); if (error) return (error); } } sz -= len; /* Check for user abort. 
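Console input is polled between chunks; a ^C cancels the dump with ECANCELED, while any other key only prints a reminder.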
*/ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } static int dump_pmap(struct dumperinfo *di) { void *ctx; char *buf; u_long nbytes; int error; ctx = dumpsys_dump_pmap_init(sizeof(pgbuf) / PAGE_SIZE); for (;;) { buf = dumpsys_dump_pmap(ctx, pgbuf, &nbytes); if (buf == NULL) break; error = blk_write(di, buf, 0, nbytes); if (error) return (error); } return (0); } int cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) { vm_paddr_t pa; int error, retry_count; uint32_t pmapsize; struct minidumphdr mdhdr; struct msgbuf *mbp; retry_count = 0; retry: retry_count++; fragsz = 0; DBG(total = dumptotal = 0;) /* Build set of dumpable pages from kernel pmap */ pmapsize = dumpsys_scan_pmap(state->dump_bitset); if (pmapsize % PAGE_SIZE != 0) { printf("pmapsize not page aligned: 0x%x\n", pmapsize); return (EINVAL); } /* Calculate dump size */ mbp = state->msgbufp; dumpsize = PAGE_SIZE; /* header */ dumpsize += round_page(mbp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); dumpsize += pmapsize; VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { /* Clear out undumpable pages now if needed */ if (vm_phys_is_dumpable(pa)) dumpsize += PAGE_SIZE; else vm_page_dump_drop(state->dump_bitset, pa); } dumpsys_pb_init(dumpsize); /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); strncpy(mdhdr.mmu_name, pmap_mmu_name(), sizeof(mdhdr.mmu_name) - 1); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS; mdhdr.kernend = VM_MAX_SAFE_KERNEL_ADDRESS; mdhdr.dmapbase = DMAP_BASE_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.hw_direct_map = hw_direct_map; mdhdr.startkernel = __startkernel; mdhdr.endkernel = __endkernel; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_POWERPC_VERSION, dumpsize); error = dump_start(di, &kdh); if (error) goto fail; printf("Dumping %lu out of %ju MB:", dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump minidump header */ bzero(pgbuf, sizeof(pgbuf)); memcpy(pgbuf, &mdhdr, sizeof(mdhdr)); error = blk_write(di, pgbuf, 0, PAGE_SIZE); if (error) goto fail; dump_total("header", PAGE_SIZE); /* Dump msgbuf up front */ error = blk_write(di, mbp->msg_ptr, 0, round_page(mbp->msg_size)); dump_total("msgbuf", round_page(mbp->msg_size)); /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(pgbuf), "Large dump_avail not handled"); bzero(pgbuf, sizeof(mdhdr)); memcpy(pgbuf, dump_avail, sizeof(dump_avail)); error = blk_write(di, pgbuf, 0, PAGE_SIZE); if (error) goto fail; dump_total("dump_avail", round_page(sizeof(dump_avail))); /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; dump_total("bitmap", round_page(BITSET_SIZE(vm_page_dump_pages))); /* Dump kernel page directory pages */ error = dump_pmap(di); if (error) goto fail; dump_total("pmap", pmapsize); /* Dump memory chunks */ VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; dump_total("mem_chunks", dumpsize - total); error = dump_finish(di, &kdh); if (error) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if 
(error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < dump_retry_count) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) printf("Dump failed. Partition too small.\n"); else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } #if MINIDUMP_DEBUG static void dump_total(const char *id, size_t sz) { total += sz; dprintf("\n%s=%08lx/%08lx/%08lx\n", id, sz, total, dumptotal); } #endif diff --git a/sys/riscv/riscv/minidump_machdep.c b/sys/riscv/riscv/minidump_machdep.c index ee6faaa6680f..9a47bb31a6f7 100644 --- a/sys/riscv/riscv/minidump_machdep.c +++ b/sys/riscv/riscv/minidump_machdep.c @@ -1,370 +1,370 @@ /*- * Copyright (c) 2006 Peter Wemm * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2019 Mitchell Horne * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); static struct kerneldumpheader kdh; /* Handle chunked writes. */ static size_t fragsz; static void *dump_va; static size_t dumpsize; static uint64_t tmpbuffer[PAGE_SIZE / sizeof(uint64_t)]; static int blk_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); - error = dump_append(di, dump_va, 0, fragsz); + error = dump_append(di, dump_va, fragsz); fragsz = 0; return (error); } /* * Write a block of data to the dump file. * * Caller can provide data through a pointer or by specifying its * physical address. * * XXX writes using pa should be no larger than PAGE_SIZE. 
*/ static int blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz) { size_t len; int error, c; u_int maxdumpsz; maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE); if (maxdumpsz == 0) /* seatbelt */ maxdumpsz = PAGE_SIZE; error = 0; if ((sz % PAGE_SIZE) != 0) { printf("size not page aligned\n"); return (EINVAL); } if (ptr != NULL && pa != 0) { printf("cant have both va and pa!\n"); return (EINVAL); } if ((((uintptr_t)pa) % PAGE_SIZE) != 0) { printf("address not page aligned %#lx\n", (uintptr_t)pa); return (EINVAL); } if (ptr != NULL) { /* * If we're doing a virtual dump, flush any * pre-existing pa pages. */ error = blk_flush(di); if (error != 0) return (error); } while (sz) { len = maxdumpsz - fragsz; if (len > sz) len = sz; dumpsys_pb_progress(len); wdog_kern_pat(WD_LASTVAL); if (ptr) { - error = dump_append(di, ptr, 0, len); + error = dump_append(di, ptr, len); if (error != 0) return (error); ptr += len; sz -= len; } else { dump_va = (void *)PHYS_TO_DMAP(pa); fragsz += len; pa += len; sz -= len; error = blk_flush(di); if (error != 0) return (error); } /* Check for user abort */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } return (0); } int cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state) { pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e; struct minidumphdr mdhdr; struct msgbuf *mbp; uint32_t pmapsize; vm_offset_t va, kva_max; vm_paddr_t pa; int error; int i; int retry_count; retry_count = 0; retry: retry_count++; error = 0; pmapsize = 0; /* Snapshot the KVA upper bound in case it grows. */ kva_max = kernel_vm_end; /* * Walk the kernel page table pages, setting the active entries in the * dump bitmap. * * NB: for a live dump, we may be racing with updates to the page * tables, so care must be taken to read each entry only once. 
*/ for (va = VM_MIN_KERNEL_ADDRESS; va < kva_max; va += L2_SIZE) { pmapsize += PAGE_SIZE; if (!pmap_get_tables(pmap_kernel(), va, &l1, &l2, &l3)) continue; /* We should always be using the l2 table for kvm */ if (l2 == NULL) continue; /* l2 may be a superpage */ l2e = atomic_load_64(l2); if ((l2e & PTE_RWX) != 0) { pa = (l2e >> PTE_PPN1_S) << L2_SHIFT; for (i = 0; i < Ln_ENTRIES; i++, pa += PAGE_SIZE) { if (vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); } } else { for (i = 0; i < Ln_ENTRIES; i++) { l3e = atomic_load_64(&l3[i]); if ((l3e & PTE_V) == 0) continue; pa = (l3e >> PTE_PPN0_S) * PAGE_SIZE; if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) vm_page_dump_add(state->dump_bitset, pa); } } } /* Calculate dump size */ mbp = state->msgbufp; dumpsize = pmapsize; dumpsize += round_page(mbp->msg_size); dumpsize += round_page(sizeof(dump_avail)); dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages)); VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { /* Clear out undumpable pages now if needed */ if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) dumpsize += PAGE_SIZE; else vm_page_dump_drop(state->dump_bitset, pa); } dumpsize += PAGE_SIZE; dumpsys_pb_init(dumpsize); /* Initialize mdhdr */ bzero(&mdhdr, sizeof(mdhdr)); strcpy(mdhdr.magic, MINIDUMP_MAGIC); mdhdr.version = MINIDUMP_VERSION; mdhdr.msgbufsize = mbp->msg_size; mdhdr.bitmapsize = round_page(BITSET_SIZE(vm_page_dump_pages)); mdhdr.pmapsize = pmapsize; mdhdr.kernbase = KERNBASE; mdhdr.dmapphys = DMAP_MIN_PHYSADDR; mdhdr.dmapbase = DMAP_MIN_ADDRESS; mdhdr.dmapend = DMAP_MAX_ADDRESS; mdhdr.dumpavailsize = round_page(sizeof(dump_avail)); dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_RISCV_VERSION, dumpsize); error = dump_start(di, &kdh); if (error != 0) goto fail; printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20, ptoa((uintmax_t)physmem) / 1048576); /* Dump minidump header */ bzero(&tmpbuffer, sizeof(tmpbuffer)); bcopy(&mdhdr, &tmpbuffer, sizeof(mdhdr)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump msgbuf up front */ error = blk_write(di, mbp->msg_ptr, 0, round_page(mbp->msg_size)); if (error) goto fail; /* Dump dump_avail */ _Static_assert(sizeof(dump_avail) <= sizeof(tmpbuffer), "Large dump_avail not handled"); bzero(tmpbuffer, sizeof(tmpbuffer)); memcpy(tmpbuffer, dump_avail, sizeof(dump_avail)); error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Dump bitmap */ error = blk_write(di, (char *)vm_page_dump, 0, round_page(BITSET_SIZE(vm_page_dump_pages))); if (error) goto fail; /* Dump kernel page directory pages */ bzero(&tmpbuffer, sizeof(tmpbuffer)); for (va = VM_MIN_KERNEL_ADDRESS; va < kva_max; va += L2_SIZE) { if (!pmap_get_tables(pmap_kernel(), va, &l1, &l2, &l3)) { /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Flush, in case we reuse tmpbuffer in the same block */ error = blk_flush(di); if (error) goto fail; continue; } l2e = atomic_load_64(l2); if ((l2e & PTE_RWX) != 0) { /* Generate fake l3 entries based on the l2 superpage */ for (i = 0; i < Ln_ENTRIES; i++) { tmpbuffer[i] = (l2e | (i << PTE_PPN0_S)); } /* We always write a page, even if it is zero */ error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; /* Flush, in case we reuse tmpbuffer in the same block */ error = blk_flush(di); if (error) goto fail; bzero(&tmpbuffer, sizeof(tmpbuffer)); } else { pa = (l2e >> PTE_PPN0_S) * PAGE_SIZE; /* * We always write 
a page, even if it is zero. If pa * is malformed, write the zeroed tmpbuffer. */ if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) error = blk_write(di, NULL, pa, PAGE_SIZE); else error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE); if (error) goto fail; } } /* Dump memory chunks */ /* XXX cluster it up and use blk_dump() */ VM_PAGE_DUMP_FOREACH(state->dump_bitset, pa) { error = blk_write(di, 0, pa, PAGE_SIZE); if (error) goto fail; } error = blk_flush(di); if (error) goto fail; error = dump_finish(di, &kdh); if (error != 0) goto fail; printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; printf("\n"); if (error == ENOSPC) { printf("Dump map grown while dumping. "); if (retry_count < 5) { printf("Retrying...\n"); goto retry; } printf("Dump failed.\n"); } else if (error == ECANCELED) printf("Dump aborted\n"); else if (error == E2BIG) { printf("Dump failed. Partition too small (about %lluMB were " "needed this time).\n", (long long)dumpsize >> 20); } else printf("** DUMP FAILED (ERROR %d) **\n", error); return (error); } diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 37265c6988ce..9f60dcaf62fa 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -1,387 +1,387 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2000 * Poul-Henning Kamp. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)conf.h 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_CONF_H_ #define _SYS_CONF_H_ #ifdef _KERNEL #include #else #include #endif struct snapdata; struct devfs_dirent; struct cdevsw; struct file; struct cdev { void *si_spare0; u_int si_flags; #define SI_ETERNAL 0x0001 /* never destroyed */ #define SI_ALIAS 0x0002 /* carrier of alias name */ #define SI_NAMED 0x0004 /* make_dev{_alias} has been called */ #define SI_UNUSED1 0x0008 /* unused */ #define SI_CHILD 0x0010 /* child of another struct cdev **/ #define SI_DUMPDEV 0x0080 /* is kernel dumpdev */ #define SI_CLONELIST 0x0200 /* on a clone list */ #define SI_UNMAPPED 0x0400 /* can handle unmapped I/O */ #define SI_NOSPLIT 0x0800 /* I/O should not be split up */ struct timespec si_atime; struct timespec si_ctime; struct timespec si_mtime; uid_t si_uid; gid_t si_gid; mode_t si_mode; struct ucred *si_cred; /* cached clone-time credential */ int si_drv0; int si_refcount; LIST_ENTRY(cdev) si_list; LIST_ENTRY(cdev) si_clone; LIST_HEAD(, cdev) si_children; LIST_ENTRY(cdev) si_siblings; struct cdev *si_parent; struct mount *si_mountpt; void *si_drv1, *si_drv2; struct cdevsw *si_devsw; int si_iosize_max; /* maximum I/O size (for physio &al) */ u_long si_usecount; u_long si_threadcount; union { struct snapdata *__sid_snapdata; } __si_u; char si_name[SPECNAMELEN + 1]; }; #define si_snapdata __si_u.__sid_snapdata #ifdef _KERNEL /* * Definitions of device driver entry switches */ struct bio; struct buf; struct dumperinfo; struct kerneldumpheader; struct thread; struct uio; struct knote; struct clonedevs; struct vm_object; struct vnode; typedef int d_open_t(struct cdev *dev, int oflags, int devtype, struct thread *td); typedef int d_fdopen_t(struct cdev *dev, int oflags, struct thread *td, struct file *fp); typedef int d_close_t(struct cdev *dev, int fflag, int devtype, struct thread *td); typedef void d_strategy_t(struct bio *bp); typedef int d_ioctl_t(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); typedef int d_read_t(struct cdev *dev, struct uio *uio, int ioflag); typedef int d_write_t(struct cdev *dev, struct uio *uio, int ioflag); typedef int d_poll_t(struct cdev *dev, int events, struct thread *td); typedef int d_kqfilter_t(struct cdev *dev, struct knote *kn); typedef int d_mmap_t(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr); typedef int d_mmap_single_t(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot); typedef void d_purge_t(struct cdev *dev); typedef int dumper_t( void *_priv, /* Private to the driver. */ void *_virtual, /* Virtual (mapped) address. */ off_t _offset, /* Byte-offset to write at. */ size_t _length); /* Number of bytes to dump. */ typedef int dumper_start_t(struct dumperinfo *di, void *key, uint32_t keysize); typedef int dumper_hdr_t(struct dumperinfo *di, struct kerneldumpheader *kdh); #endif /* _KERNEL */ /* * Types for d_flags. */ #define D_TAPE 0x0001 #define D_DISK 0x0002 #define D_TTY 0x0004 #define D_MEM 0x0008 /* /dev/(k)mem */ /* Defined uid and gid values. */ #define UID_ROOT 0 #define UID_BIN 3 #define UID_UUCP 66 #define UID_NOBODY 65534 #define GID_WHEEL 0 #define GID_KMEM 2 #define GID_TTY 4 #define GID_OPERATOR 5 #define GID_BIN 7 #define GID_GAMES 13 #define GID_VIDEO 44 #define GID_RT_PRIO 47 #define GID_ID_PRIO 48 #define GID_DIALER 68 #define GID_NOGROUP 65533 #define GID_NOBODY 65534 #ifdef _KERNEL #define D_TYPEMASK 0xffff /* * Flags for d_flags which the drivers can set. 
*/ #define D_TRACKCLOSE 0x00080000 /* track all closes */ #define D_MMAP_ANON 0x00100000 /* special treatment in vm_mmap.c */ #define D_GIANTOK 0x00200000 /* suppress warning about using Giant */ #define D_NEEDGIANT 0x00400000 /* driver want Giant */ #define D_NEEDMINOR 0x00800000 /* driver uses clone_create() */ /* * Version numbers. */ #define D_VERSION_00 0x20011966 #define D_VERSION_01 0x17032005 /* Add d_uid,gid,mode & kind */ #define D_VERSION_02 0x28042009 /* Add d_mmap_single */ #define D_VERSION_03 0x17122009 /* d_mmap takes memattr,vm_ooffset_t */ #define D_VERSION_04 0x5c48c353 /* SPECNAMELEN bumped to MAXNAMLEN */ #define D_VERSION D_VERSION_04 /* * Flags used for internal housekeeping */ #define D_INIT 0x80000000 /* cdevsw initialized */ /* * Character device switch table */ struct cdevsw { int d_version; u_int d_flags; const char *d_name; d_open_t *d_open; d_fdopen_t *d_fdopen; d_close_t *d_close; d_read_t *d_read; d_write_t *d_write; d_ioctl_t *d_ioctl; d_poll_t *d_poll; d_mmap_t *d_mmap; d_strategy_t *d_strategy; void *d_spare0; d_kqfilter_t *d_kqfilter; d_purge_t *d_purge; d_mmap_single_t *d_mmap_single; int32_t d_spare1[3]; void *d_spare2[3]; /* These fields should not be messed with by drivers */ LIST_HEAD(, cdev) d_devs; int d_spare3; union { struct cdevsw *gianttrick; SLIST_ENTRY(cdevsw) postfree_list; } __d_giant; }; #define d_gianttrick __d_giant.gianttrick #define d_postfree_list __d_giant.postfree_list struct module; struct devsw_module_data { int (*chainevh)(struct module *, int, void *); /* next handler */ void *chainarg; /* arg for next event handler */ /* Do not initialize fields hereafter */ }; #define DEV_MODULE_ORDERED(name, evh, arg, ord) \ static moduledata_t name##_mod = { \ #name, \ evh, \ arg \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, ord) #define DEV_MODULE(name, evh, arg) \ DEV_MODULE_ORDERED(name, evh, arg, SI_ORDER_MIDDLE) void clone_setup(struct clonedevs **cdp); void clone_cleanup(struct clonedevs **); #define CLONE_UNITMASK 0xfffff #define CLONE_FLAG0 (CLONE_UNITMASK + 1) int clone_create(struct clonedevs **, struct cdevsw *, int *unit, struct cdev **dev, int extra); #define MAKEDEV_REF 0x01 #define MAKEDEV_WHTOUT 0x02 #define MAKEDEV_NOWAIT 0x04 #define MAKEDEV_WAITOK 0x08 #define MAKEDEV_ETERNAL 0x10 #define MAKEDEV_CHECKNAME 0x20 struct make_dev_args { size_t mda_size; int mda_flags; struct cdevsw *mda_devsw; struct ucred *mda_cr; uid_t mda_uid; gid_t mda_gid; int mda_mode; int mda_unit; void *mda_si_drv1; void *mda_si_drv2; }; void make_dev_args_init_impl(struct make_dev_args *_args, size_t _sz); #define make_dev_args_init(a) \ make_dev_args_init_impl((a), sizeof(struct make_dev_args)) void delist_dev(struct cdev *_dev); void destroy_dev(struct cdev *_dev); int destroy_dev_sched(struct cdev *dev); int destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg); void destroy_dev_drain(struct cdevsw *csw); void drain_dev_clone_events(void); struct cdevsw *dev_refthread(struct cdev *_dev, int *_ref); struct cdevsw *devvn_refthread(struct vnode *vp, struct cdev **devp, int *_ref); void dev_relthread(struct cdev *_dev, int _ref); void dev_depends(struct cdev *_pdev, struct cdev *_cdev); void dev_ref(struct cdev *dev); void dev_refl(struct cdev *dev); void dev_rel(struct cdev *dev); struct cdev *make_dev(struct cdevsw *_devsw, int _unit, uid_t _uid, gid_t _gid, int _perms, const char *_fmt, ...) 
__printflike(6, 7); struct cdev *make_dev_cred(struct cdevsw *_devsw, int _unit, struct ucred *_cr, uid_t _uid, gid_t _gid, int _perms, const char *_fmt, ...) __printflike(7, 8); struct cdev *make_dev_credf(int _flags, struct cdevsw *_devsw, int _unit, struct ucred *_cr, uid_t _uid, gid_t _gid, int _mode, const char *_fmt, ...) __printflike(8, 9); int make_dev_p(int _flags, struct cdev **_cdev, struct cdevsw *_devsw, struct ucred *_cr, uid_t _uid, gid_t _gid, int _mode, const char *_fmt, ...) __printflike(8, 9); int make_dev_s(struct make_dev_args *_args, struct cdev **_cdev, const char *_fmt, ...) __printflike(3, 4); struct cdev *make_dev_alias(struct cdev *_pdev, const char *_fmt, ...) __printflike(2, 3); int make_dev_alias_p(int _flags, struct cdev **_cdev, struct cdev *_pdev, const char *_fmt, ...) __printflike(4, 5); int make_dev_physpath_alias(int _flags, struct cdev **_cdev, struct cdev *_pdev, struct cdev *_old_alias, const char *_physpath); void dev_lock(void); void dev_unlock(void); #ifdef KLD_MODULE #define MAKEDEV_ETERNAL_KLD 0 #else #define MAKEDEV_ETERNAL_KLD MAKEDEV_ETERNAL #endif #define dev2unit(d) ((d)->si_drv0) typedef void d_priv_dtor_t(void *data); int devfs_get_cdevpriv(void **datap); int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *dtr); void devfs_clear_cdevpriv(void); ino_t devfs_alloc_cdp_inode(void); void devfs_free_cdp_inode(ino_t ino); typedef void (*dev_clone_fn)(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **result); int dev_stdclone(char *_name, char **_namep, const char *_stem, int *_unit); EVENTHANDLER_DECLARE(dev_clone, dev_clone_fn); /* Stuff relating to kernel-dump */ struct kerneldumpcrypto; struct kerneldumpheader; struct dumperinfo { dumper_t *dumper; /* Dumping function. */ dumper_start_t *dumper_start; /* Dumper callback for dump_start(). */ dumper_hdr_t *dumper_hdr; /* Dumper callback for writing headers. */ void *priv; /* Private parts. */ u_int blocksize; /* Size of block in bytes. */ u_int maxiosize; /* Max size allowed for an individual I/O */ off_t mediaoffset; /* Initial offset in bytes. */ off_t mediasize; /* Space available in bytes. */ /* MI kernel dump state. */ void *blockbuf; /* Buffer for padding shorter dump blocks */ off_t dumpoff; /* Offset of ongoing kernel dump. */ off_t origdumpoff; /* Starting dump offset. */ struct kerneldumpcrypto *kdcrypto; /* Kernel dump crypto. */ struct kerneldumpcomp *kdcomp; /* Kernel dump compression. */ TAILQ_ENTRY(dumperinfo) di_next; char di_devname[]; }; extern int dumping; /* system is dumping */ void dump_savectx(void); int doadump(boolean_t); struct diocskerneldump_arg; int dumper_create(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda, struct dumperinfo **dip); void dumper_destroy(struct dumperinfo *di); int dumper_insert(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda); int dumper_remove(const char *devname, const struct diocskerneldump_arg *kda); /* For ddb(4)-time use only. 
*/ void dumper_ddb_insert(struct dumperinfo *); void dumper_ddb_remove(struct dumperinfo *); int dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh); -int dump_append(struct dumperinfo *, void *, vm_offset_t, size_t); -int dump_write(struct dumperinfo *, void *, vm_offset_t, off_t, size_t); +int dump_append(struct dumperinfo *, void *, size_t); +int dump_write(struct dumperinfo *, void *, off_t, size_t); int dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh); void dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh, const char *magic, uint32_t archver, uint64_t dumplen); #endif /* _KERNEL */ #endif /* !_SYS_CONF_H_ */
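A minimal sketch of the calling convention after this change, for reviewers: dump_append() drops the unused vm_offset_t physical-address argument and now takes (di, virtual, length), and dump_write() likewise takes (di, virtual, offset, length), as declared in sys/sys/conf.h above. The helper names below (example_flush, example_terminate) and the frag_buf/frag_len staging state are hypothetical, used only to mirror the blk_flush()/dump_finish() pattern already visible in the patch; they are not part of the change itself.

#include <sys/param.h>
#include <sys/conf.h>

/* Hypothetical staging state, analogous to the MD code's dump_va/fragsz. */
static char frag_buf[PAGE_SIZE];
static size_t frag_len;

static int
example_flush(struct dumperinfo *di)
{
	int error;

	if (frag_len == 0)
		return (0);
	/* New three-argument form: no vm_offset_t "physical" parameter. */
	error = dump_append(di, frag_buf, frag_len);
	frag_len = 0;
	return (error);
}

static int
example_terminate(struct dumperinfo *di)
{
	/*
	 * A zero-length dump_write(), as dump_finish() issues after the
	 * trailing headers, signals the dump device that the dump is
	 * complete.
	 */
	return (dump_write(di, NULL, 0, 0));
}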