diff --git a/sys/compat/freebsd32/syscalls.conf b/sys/compat/freebsd32/syscalls.conf
--- a/sys/compat/freebsd32/syscalls.conf
+++ b/sys/compat/freebsd32/syscalls.conf
@@ -48,10 +48,11 @@
 # Syscalls without implementations:
 # __mac_* - should be implemented
 # afs3_syscall - requires significant porting, probably doesn't make sense
+# kexec_load - makes little sense on 64-bit hardware
 # kldsym - can't be implemented (kernel virtual addresses can't fit in 32-bits)
 # lgetfh - should be implemented
 # nlm_syscall - requires significant porting, probably doesn't make sense
 # nnpfs_syscall - requires significant porting, probably doesn't make sense
 # ntp_gettime - should be implemented
 # thr_create - was unimplemented and appears to be unnecessary
-unimpl="afs3_syscall kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create"
+unimpl="afs3_syscall kexec_load kldsym __mac_get_proc __mac_set_proc __mac_get_fd __mac_get_file __mac_set_fd __mac_set_file __mac_get_pid __mac_get_link __mac_set_link __mac_execve nfssvc nlm_syscall ntp_gettime lgetfh nnpfs_syscall thr_create"
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3811,6 +3811,7 @@
 kern/kern_jailmeta.c		standard
 kern/kern_kcov.c		optional kcov \
	compile-with "${NOSAN_C} ${MSAN_CFLAGS}"
+kern/kern_kexec.c		standard
 kern/kern_khelp.c		standard
 kern/kern_kthread.c		standard
 kern/kern_ktr.c			optional ktr
diff --git a/sys/kern/kern_kexec.c b/sys/kern/kern_kexec.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_kexec.c
@@ -0,0 +1,350 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#ifdef INTRNG
+#include <sys/intr.h>
+#endif
+#include <sys/kexec.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
+
+#include <machine/kexec.h>
+
+#ifndef KEXEC_MD_PAGES
+/*
+ * Number of MD pages for extra bookkeeping.
+ * This is a macro because it can be a constant (some architectures make
+ * it 0).  It accepts an argument, which is an array of
+ * kexec_segment[KEXEC_SEGMENT_MAX].
+ */
+#define KEXEC_MD_PAGES(x)	0
+#endif
+
+/*
+ * Basic design:
+ *
+ * Given an array of "segment descriptors", stage an image to be loaded
+ * and jumped to at reboot, instead of rebooting via firmware.
+ *
+ * Constraints:
+ * - The segment descriptors' "mem" and "memsz" must each fit within a
+ *   vm_phys_seg segment, which can be obtained via the `vm.phys_segs`
+ *   sysctl.  A single segment cannot span multiple vm_phys_seg segments,
+ *   even if the vm_phys_seg segments are adjacent.
+ *
+ * Technical details:
+ *
+ * Take advantage of the VM subsystem and create a vm_object to hold the
+ * staged image.  When grabbing pages for the object, sort the pages so
+ * that if a page in the object is located in the physical range of any
+ * of the kexec segment targets then it gets placed at the pindex
+ * corresponding to that physical address.  This avoids the chance of
+ * corruption by writing over the page in the final copy, or the need
+ * for a copy buffer page.
+ */
+
+static struct kexec_image staged_image;
+static vm_offset_t stage_addr;
+static vm_object_t kexec_obj;
+
+static eventhandler_tag kexec_reboot_handler;
+static struct mtx kexec_mutex;
+
+static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments");
+
+static void
+kexec_reboot(void *junk __unused, int howto)
+{
+	if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL)
+		return;
+
+#ifdef SMP
+	cpu_mp_stop();
+#endif /* SMP */
+	intr_disable();
+	printf("Starting kexec reboot\n");
+
+	scheduler_stopped = true;
+	kexec_reboot_md(&staged_image);
+}
+
+MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF);
+
+/* Sort the segment list once copied in. */
+static int
+seg_cmp(const void *seg1, const void *seg2)
+{
+	const struct kexec_segment *s1, *s2;
+
+	s1 = seg1;
+	s2 = seg2;
+
+	/*
+	 * Compare explicitly: subtracting the addresses and truncating
+	 * the difference to 'int' can return the wrong sign.
+	 */
+	if (s1->mem < s2->mem)
+		return (-1);
+	if (s1->mem > s2->mem)
+		return (1);
+	return (0);
+}
+
+static bool
+segment_fits(struct kexec_segment *seg)
+{
+	vm_paddr_t v = seg->mem;
+
+	for (int i = 0; i < vm_phys_nsegs; i++) {
+		if (v >= vm_phys_segs[i].start &&
+		    (v + seg->memsz - 1) <= vm_phys_segs[i].end)
+			return (true);
+	}
+
+	return (false);
+}
+
+static vm_paddr_t
+pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind)
+{
+	for (int i = count; i > 0; --i) {
+		if (pind >= segs[i - 1].pindex)
+			return (ptoa(pind - segs[i - 1].pindex) +
+			    segs[i - 1].target);
+	}
+
+	panic("No segment for pindex %ju", (uintmax_t)pind);
+}
+
+/*
+ * For now still tied to the system call, so assumes all memory is
+ * userspace.
+ */
+int
+kern_kexec_load(struct thread *td, u_long entry, u_long nseg,
+    struct kexec_segment *seg, u_long flags)
+{
+	static int kexec_loading;
+	struct kexec_segment segtmp[KEXEC_SEGMENT_MAX];
+	struct kexec_image *new_image_stage = NULL;
+	vm_object_t new_segments = NULL;
+	uint8_t *buf;
+	int err = 0;
+	int i;
+	const size_t segsize = nseg * sizeof(struct kexec_segment);
+	vm_page_t *page_list = NULL;
+	vm_size_t image_count, md_pages, page_count, tmpsize;
+	vm_offset_t segment_va = 0;
+
+	/*
+	 * - Do any sanity checking
+	 * - Load the new segments to temporary
+	 * - Remove the old segments
+	 * - Install the new segments
+	 */
+
+	if (nseg > KEXEC_SEGMENT_MAX)
+		return (EINVAL);
+
+	if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0)
+		return (EBUSY);
+
+	/* Stage and sanity-check new segments only if any were given. */
+	if (nseg > 0) {
+		/* Create the new kexec object before destroying the old one. */
+		bzero(&segtmp, sizeof(segtmp));
+		err = copyin(seg, segtmp, segsize);
+		if (err != 0)
+			goto out;
+		qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp);
+		new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP,
+		    M_WAITOK | M_ZERO);
+		/*
+		 * Sanity checking:
+		 * - All segments must not overlap the kernel, so must be
+		 *   fully enclosed in a vm_phys_seg (each kexec segment
+		 *   must be in a single vm_phys_seg segment, cannot cross
+		 *   even adjacent segments).
+		 */
+		image_count = 0;
+		for (i = 0; i < nseg; i++) {
+			if (!segment_fits(&segtmp[i]) ||
+			    segtmp[i].bufsz > segtmp[i].memsz) {
+				err = EINVAL;
+				goto out;
+			}
+			new_image_stage->segments[i].pindex = image_count;
+			new_image_stage->segments[i].target = segtmp[i].mem;
+			new_image_stage->segments[i].size = segtmp[i].memsz;
+			image_count += atop(segtmp[i].memsz);
+		}
+		md_pages = KEXEC_MD_PAGES(segtmp);
+		page_count = image_count + md_pages;
+		new_segments = vm_object_allocate(OBJT_PHYS, page_count);
+		page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP,
+		    M_WAITOK);
+
+		/*
+		 * - Grab all pages for all segments (use pindex to slice it)
+		 * - Walk the list (once)
+		 *   - At each pindex, check if the target PA that
+		 *     corresponds to that index is in the object.  If so,
+		 *     swap the pages.
+		 * - At the end of this the list will be "best" sorted.
+		 */
+		vm_page_grab_pages_unlocked(new_segments, 0,
+		    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
+		    VM_ALLOC_NOBUSY | VM_ALLOC_ZERO,
+		    page_list, page_count);
+
+		/* Sort the pages to best match the PA. */
+		VM_OBJECT_WLOCK(new_segments);
+		for (i = 0; i < image_count; i++) {
+			vm_page_t curpg, otherpg, tmp;
+			vm_pindex_t otheridx;
+
+			curpg = page_list[i];
+			otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(
+			    new_image_stage->segments, nseg,
+			    curpg->pindex));
+			otheridx = otherpg->pindex;
+
+			if (otherpg->object == new_segments) {
+				/*
+				 * Swap 'curpg' and 'otherpg', since
+				 * 'otherpg' is at the PA 'curpg' covers.
+				 */
+				vm_radix_remove(&new_segments->rtree,
+				    otheridx);
+				vm_radix_remove(&new_segments->rtree, i);
+				otherpg->pindex = i;
+				curpg->pindex = otheridx;
+				vm_radix_insert(&new_segments->rtree, curpg);
+				vm_radix_insert(&new_segments->rtree,
+				    otherpg);
+				tmp = curpg;
+				page_list[i] = otherpg;
+				page_list[otheridx] = tmp;
+			}
+		}
+		for (i = 0; i < nseg; i++) {
+			new_image_stage->segments[i].first_page =
+			    vm_radix_lookup(&new_segments->rtree,
+			    new_image_stage->segments[i].pindex);
+		}
+		if (md_pages > 0)
+			new_image_stage->first_md_page =
+			    vm_radix_lookup(&new_segments->rtree,
+			    page_count - md_pages);
+		else
+			new_image_stage->first_md_page = NULL;
+		VM_OBJECT_WUNLOCK(new_segments);
+
+		/* Map the object to do the copies. */
+		err = vm_map_find(kernel_map, new_segments, 0, &segment_va,
+		    ptoa(page_count), 0, VMFS_ANY_SPACE,
+		    VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT);
+		if (err != 0)
+			goto out;
+		buf = (void *)segment_va;
+		new_image_stage->map_addr = segment_va;
+		new_image_stage->map_size = ptoa(new_segments->size);
+		new_image_stage->entry = entry;
+		new_image_stage->map_obj = new_segments;
+		for (i = 0; i < nseg; i++) {
+			err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz);
+			if (err != 0)
+				goto out;
+			new_image_stage->segments[i].map_buf = buf;
+			buf += segtmp[i].bufsz;
+			tmpsize = segtmp[i].memsz - segtmp[i].bufsz;
+			if (tmpsize > 0)
+				memset(buf, 0, tmpsize);
+			buf += tmpsize;
+		}
+		/* What's left are the MD pages; zero them all out. */
+		if (md_pages > 0)
+			bzero(buf, ptoa(md_pages));
+
+		cpu_flush_dcache((void *)segment_va, ptoa(page_count));
+		if ((err = kexec_load_md(new_image_stage)) != 0)
+			goto out;
+	}
+	if (kexec_obj != NULL) {
+		vm_object_unwire(kexec_obj, 0, ptoa(kexec_obj->size), 0);
+		KASSERT(stage_addr != 0,
+		    ("Mapped kexec_obj without address"));
+		vm_map_remove(kernel_map, stage_addr,
+		    stage_addr + ptoa(kexec_obj->size));
+	}
+	kexec_obj = new_segments;
+	/* Record the staging VA so the mapping can be torn down later. */
+	stage_addr = segment_va;
+	bzero(&staged_image, sizeof(staged_image));
+	if (nseg > 0) {
+		memcpy(&staged_image, new_image_stage,
+		    sizeof(*new_image_stage));
+		printf("trampoline at %#jx\n",
+		    (uintmax_t)staged_image.entry);
+		if (kexec_reboot_handler == NULL)
+			kexec_reboot_handler = EVENTHANDLER_REGISTER(
+			    shutdown_final, kexec_reboot, NULL,
+			    SHUTDOWN_PRI_DEFAULT - 150);
+	} else {
+		if (kexec_reboot_handler != NULL)
+			EVENTHANDLER_DEREGISTER(shutdown_final,
+			    kexec_reboot_handler);
+	}
+out:
+	/* Clean up the mess if we've gotten far. */
+	if (err != 0 && new_segments != NULL) {
+		vm_object_unwire(new_segments, 0, ptoa(new_segments->size),
+		    0);
+		if (segment_va != 0)
+			vm_map_remove(kernel_map, segment_va,
+			    segment_va + ptoa(page_count));
+		else
+			vm_object_deallocate(new_segments);
+	}
+	atomic_store_rel_int(&kexec_loading, false);
+	if (new_image_stage != NULL)
+		free(new_image_stage, M_TEMP);
+	if (page_list != NULL)
+		free(page_list, M_TEMP);
+
+	return (err);
+}
+
+int
+sys_kexec_load(struct thread *td, struct kexec_load_args *uap)
+{
+	int error;
+
+	/* FIXME: Do we need a better privilege check than PRIV_REBOOT? */
+	error = priv_check(td, PRIV_REBOOT);
+	if (error != 0)
+		return (error);
+	return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments,
+	    uap->flags));
+}
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3384,4 +3384,12 @@
		);
	}
+597	AUE_NULL	STD {
+		int kexec_load(
+		    uint64_t entry,
+		    u_long nseg,
+		    _In_reads_(nseg) _Contains_long_ptr_ struct kexec_segment *segments,
+		    u_long flags
+		);
+	}
 
 ; vim: syntax=off
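
Illustrative note, not part of the patch: the syscalls.master entry above is what generates the userspace stub, so a minimal loader could drive the new syscall as sketched below. Everything here is hypothetical: TARGET_PA is a placeholder destination that must lie entirely within one vm_phys_seg (see the vm.phys_segs sysctl), the image is assumed to already be in memory, and the entry point is assumed to coincide with the start of the single segment. Until libc is regenerated, syscall(SYS_kexec_load, ...) would be the equivalent direct route.

```c
#include <sys/param.h>	/* roundup2(), PAGE_SIZE, size_t */
#include <sys/kexec.h>	/* struct kexec_segment, kexec_load() */

#include <err.h>

#define TARGET_PA	0x80000000UL	/* placeholder destination PA */

int
stage_kernel(void *image, size_t image_size)
{
	struct kexec_segment seg = {
		.buf = image,		/* source bytes in userspace */
		.bufsz = image_size,	/* bytes copied in from .buf */
		.mem = TARGET_PA,	/* destination physical address */
		/* memsz may exceed bufsz; the kernel zero-fills the rest. */
		.memsz = roundup2(image_size, PAGE_SIZE),
	};

	/* One segment; the entry point is assumed to be its start. */
	if (kexec_load(TARGET_PA, 1, &seg, 0) != 0)
		err(1, "kexec_load");
	return (0);
}
```
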
diff --git a/sys/sys/kexec.h b/sys/sys/kexec.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/kexec.h
@@ -0,0 +1,81 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_KEXEC_H_
+#define _SYS_KEXEC_H_
+
+#include <sys/types.h>
+
+struct kexec_segment {
+	void		*buf;
+	size_t		bufsz;
+	vm_paddr_t	mem;
+	vm_size_t	memsz;
+};
+
+/* Flags (aligned with Linux). */
+#define KEXEC_ON_CRASH		0x1
+
+/* Aligned with Linux's limit. */
+#define KEXEC_SEGMENT_MAX	16
+
+#ifdef _KERNEL
+struct kexec_segment_stage {
+	vm_page_t	first_page;
+	void		*map_buf;
+	vm_paddr_t	target;
+	vm_size_t	size;
+	vm_pindex_t	pindex;
+};
+
+struct kexec_image {
+	struct kexec_segment_stage segments[KEXEC_SEGMENT_MAX];
+	vm_paddr_t	entry;
+	struct vm_object *map_obj;	/* Containing object */
+	vm_offset_t	map_addr;	/* Mapped in kernel space */
+	vm_size_t	map_size;
+	vm_page_t	first_md_page;
+	void		*md_image;
+};
+
+#endif
+
+#ifndef _KERNEL
+
+__BEGIN_DECLS
+int kexec_load(uint64_t, unsigned long, struct kexec_segment *,
+    unsigned long);
+__END_DECLS
+
+#else
+
+void	kexec_reboot_md(struct kexec_image *);
+int	kexec_load_md(struct kexec_image *);
+
+#endif
+
+#endif /* _SYS_KEXEC_H_ */
diff --git a/sys/sys/reboot.h b/sys/sys/reboot.h
--- a/sys/sys/reboot.h
+++ b/sys/sys/reboot.h
@@ -61,6 +61,7 @@
 #define	RB_REROOT	0x200000	/* unmount the rootfs and mount it again */
 #define	RB_POWERCYCLE	0x400000	/* Power cycle if possible */
 #define	RB_MUTEMSGS	0x800000	/* start up with console muted after banner */
+#define	RB_KEXEC	0x1000000	/* Boot new kernel using kexec */
 #define	RB_PROBE	0x10000000	/* Probe multiple consoles */
 #define	RB_MULTIPLE	0x20000000	/* use multiple consoles */
diff --git a/sys/sys/smp.h b/sys/sys/smp.h
--- a/sys/sys/smp.h
+++ b/sys/sys/smp.h
@@ -251,6 +251,7 @@
 int	cpu_mp_probe(void);
 void	cpu_mp_setmaxid(void);
 void	cpu_mp_start(void);
+void	cpu_mp_stop(void);	/* Go back to single-CPU */
 void	forward_signal(struct thread *);
 int	restart_cpus(cpuset_t);
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -47,6 +47,7 @@
 struct jail;
 struct kevent;
 struct kevent_copyops;
+struct kexec_segment;
 struct kld_file_stat;
 struct ksiginfo;
 struct mbuf;
@@ -400,6 +401,8 @@
 int	kern_socketpair(struct thread *td, int domain, int type, int protocol,
	    int *rsv);
 int	kern_unmount(struct thread *td, const char *path, int flags);
+int	kern_kexec_load(struct thread *td, u_long entry,
+	    u_long nseg, struct kexec_segment *seg, u_long flags);
 
 /* flags for kern_sigaction */
 #define	KSA_OSIGSET	0x0001	/* uses osigact_t */
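
A second hedged sketch, covering the trigger side: kexec_reboot() is registered as a shutdown_final handler and only acts when RB_KEXEC is set in the howto flags, so after a successful kexec_load() the staged image would be entered by requesting a reboot with that flag. RB_AUTOBOOT is 0, so RB_KEXEC alone suffices; root privilege is required either way.

```c
#include <sys/reboot.h>	/* RB_AUTOBOOT, RB_KEXEC */

#include <err.h>
#include <unistd.h>	/* reboot(2) */

/*
 * Hypothetical trigger: request a reboot with RB_KEXEC so that the
 * shutdown_final handler installed by kexec_load() jumps into the
 * staged image instead of rebooting through the firmware.
 */
int
main(void)
{
	if (reboot(RB_AUTOBOOT | RB_KEXEC) != 0)
		err(1, "reboot");
	return (0);	/* not reached on success */
}
```
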