Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 327425) +++ head/sys/kern/init_main.c (revision 327426) @@ -1,875 +1,882 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. 
A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; EVENTHANDLER_LIST_DECLARE(process_init); EVENTHANDLER_LIST_DECLARE(thread_init); EVENTHANDLER_LIST_DECLARE(process_ctor); EVENTHANDLER_LIST_DECLARE(thread_ctor); /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. 
Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif + TSENTER(); + if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. 
*/ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } + TSEXIT(); /* Here so we don't overlap with start_init. */ + mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. 
*/ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. 
I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; struct uidinfo tmpuinfo; struct loginclass tmplc = { .lc_name = "", }; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); vm_domain_policy_init(&td->td_vm_dom_policy); vm_domain_policy_set(&td->td_vm_dom_policy, 
VM_POLICY_NONE, -1); vm_domain_policy_init(&p->p_vm_dom_policy); vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ /* A hack to prevent uifind from tripping over NULL pointers. */ curthread->td_ucred = newcred; tmpuinfo.ui_uid = 1; newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo; newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_loginclass = &tmplc; newcred->cr_loginclass = loginclass_find("default"); /* End hack. creds get properly set later with thread_cow_get_proc */ curthread->td_ucred = NULL; newcred->cr_prison = &prison0; proc_set_cred_init(p, newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. 
*/ pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_DIRECT_INVOKE(process_init, p); EVENTHANDLER_DIRECT_INVOKE(thread_init, td); EVENTHANDLER_DIRECT_INVOKE(process_ctor, p); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. 
*/ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). 
" "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; + TSENTER(); /* Here so we don't overlap with mi_startup. */ + td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). 
*/ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)rounddown2((intptr_t)ucp, sizeof(intptr_t)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = sys_execve(td, &args)) == EJUSTRETURN) { mtx_unlock(&Giant); + TSEXIT(); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. 
*/
/*
 * SYSINIT hook (SI_SUB_CREATE_INIT): fork the init process from thread0,
 * give it its own copy of the credentials, and arrange for its first
 * thread to execute start_init().  The process is forked with RFSTOPPED;
 * kick_init() below is what puts its thread on a run queue.
 *
 * Panics if fork1() fails.
 */
static void
create_init(const void *udata __unused)
{
	struct fork_req fr;
	struct ucred *newcred, *oldcred;
	struct thread *td;
	int error;

	/* Fork from thread0; the new process pointer is stored in initproc. */
	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
	fr.fr_procp = &initproc;
	error = fork1(&thread0, &fr);
	if (error)
		panic("cannot fork init: %d\n", error);
	KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
	/* divorce init's credentials from the kernel's */
	/* The new ucred is obtained before proctree_lock and the proc lock are taken. */
	newcred = crget();
	sx_xlock(&proctree_lock);
	PROC_LOCK(initproc);
	initproc->p_flag |= P_SYSTEM | P_INMEM;
	/* Mark init as a reaper and put proc0 on its reap list. */
	initproc->p_treeflag |= P_TREE_REAPER;
	LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling);
	/* Copy the inherited credentials into the private ucred. */
	oldcred = initproc->p_ucred;
	crcopy(newcred, oldcred);
#ifdef MAC
	mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
	audit_cred_proc1(newcred);
#endif
	proc_set_cred(initproc, newcred);
	/* Re-point the first thread's reference at the process's new ucred. */
	td = FIRST_THREAD_IN_PROC(initproc);
	crfree(td->td_ucred);
	td->td_ucred = crhold(initproc->p_ucred);
	PROC_UNLOCK(initproc);
	sx_xunlock(&proctree_lock);
	/* The process's reference on the old ucred is dropped after unlocking. */
	crfree(oldcred);
	/* NOTE(review): presumably makes the thread start in start_init(NULL) -- confirm. */
	cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc),
	    start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);

/*
 * Make it runnable now.
 *
 * SYSINIT hook (SI_SUB_KTHREAD_INIT): with the thread lock held, mark
 * init's first thread as able to run and hand it to the scheduler.
 */
static void
kick_init(const void *udata __unused)
{
	struct thread *td;

	td = FIRST_THREAD_IN_PROC(initproc);
	thread_lock(td);
	TD_SET_CAN_RUN(td);
	sched_add(td, SRQ_BORING);
	thread_unlock(td);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
Index: head/sys/kern/vfs_mountroot.c =================================================================== --- head/sys/kern/vfs_mountroot.c (revision 327425) +++ head/sys/kern/vfs_mountroot.c (revision 327426) @@ -1,1111 +1,1119 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_rootdevname.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * <vfsname>:[<path>][ <vfsname>:[<path>] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. 
*/ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); static int root_mount_always_wait = 0; SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN, &root_mount_always_wait, 0, "Wait for root mount holds even if the root device already exists"); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); LIST_FOREACH(h, &root_holds, list) { if (h != LIST_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&root_holds_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL) return; mtx_lock(&root_holds_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); free(h, M_DEVBUF); } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. 
*/ return (root_mount_complete); } static void set_rootvnode(void) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("Cannot find root vnode"); VOP_UNLOCK(rootvnode, 0); p = curthread->td_proc; FILEDESC_XLOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_XUNLOCK(p->p_fd); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. */ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. 
*/ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mporoot, true); if (mporoot != mpdevfs) cache_purgevfs(mpdevfs, true); VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vporoot); vporoot->v_mountedhere = NULL; mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); cache_purgevfs(rootvnode->v_mount, true); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); error = namei(&nd); } if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); mporoot->mnt_vnodecovered = vp; vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VOP_UNLOCK(vp, 0); } else vput(vp); } NDFREE(&nd, NDF_ONLY_PNBUF); if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (!error) { vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 
0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); vpdevfs->v_mountedhere = NULL; vrele(vpdevfs); } mpdevfs->mnt_vnodecovered = vp; vp->v_mountedhere = mpdevfs; VOP_UNLOCK(vp, 0); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); NDFREE(&nd, NDF_ONLY_PNBUF); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_unlinkat(td, AT_FDCWD, "/dev/dev", UIO_SYSSPACE, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 
1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" : [options]\n"); printf(" Mount using filesystem \n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:tank\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' 
&& name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); PICKUP_GIANT(); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; DROP_GIANT(); error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); PICKUP_GIANT(); /* Ignore errors. We don't care. 
*/ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); error = kern_close(td, fd); out: free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. */ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static int parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread); error = namei(&nd); if (!error) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error != 0) ? 
0 : 1;
}

/* Size of the buffer handed to the mount code for its error message. */
#define	ERRMSGL	255
/*
 * Parse one "fstype:device [options]" entry from *conf and try to mount
 * it on "/" with MNT_ROOTFS.
 *
 * Returns a negative PE_* code if the entry cannot be tokenised or has
 * no ':' separator, 0 on a successful mount, or a positive errno value
 * otherwise (a negative result from kernel_mount() is mapped to EDOOFUS).
 */
static int
parse_mount(char **conf)
{
	char *errmsg;
	struct mntarg *ma;
	char *dev, *fs, *opts, *tok;
	int error;

	error = parse_token(conf, &tok);
	if (error)
		return (error);
	/* Split the token in place at ':' into fs and dev. */
	fs = tok;
	error = parse_skipto(&tok, ':');
	if (error) {
		free(fs, M_TEMP);
		return (error);
	}
	parse_poke(&tok, '\0');
	parse_advance(&tok);
	dev = tok;

	if (root_mount_mddev != -1) {
		/* Handle substitution for the md unit number. */
		tok = strstr(dev, "md#");
		if (tok != NULL)
			tok[2] = '0' + root_mount_mddev;
	}

	/* Parse options. */
	error = parse_token(conf, &tok);
	opts = (error == 0) ? tok : NULL;

	printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
	    (opts != NULL) ? opts : "");

	errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);

	if (vfs_byname(fs) == NULL) {
		strlcpy(errmsg, "unknown file system", ERRMSGL);
		error = ENOENT;
		goto out;
	}

	error = vfs_mountroot_wait_if_neccessary(fs, dev);
	if (error != 0)
		goto out;

	/* "ro" is always passed; see parse_mountroot_options() for rw/noro. */
	ma = NULL;
	ma = mount_arg(ma, "fstype", fs, -1);
	ma = mount_arg(ma, "fspath", "/", -1);
	ma = mount_arg(ma, "from", dev, -1);
	ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
	ma = mount_arg(ma, "ro", NULL, 0);
	ma = parse_mountroot_options(ma, opts);
	error = kernel_mount(ma, MNT_ROOTFS);

 out:
	if (error) {
		printf("Mounting from %s:%s failed with error %d",
		    fs, dev, error);
		if (errmsg[0] != '\0')
			printf(": %s", errmsg);
		printf(".\n");
	}
	/* dev points into the fs allocation, so one free() covers both. */
	free(fs, M_TEMP);
	free(errmsg, M_TEMP);
	if (opts != NULL)
		free(opts, M_TEMP);
	/* kernel_mount can return -1 on error. */
	return ((error < 0) ? EDOOFUS : error);
}
#undef ERRMSGL

/*
 * Run the configuration text held in sb until a mount appears after
 * mpdevfs on the mount list or the input is exhausted.
 *
 * Each line is a comment ('#'), a directive ('.') or a mount entry.
 * Returns 0 once a mount follows mpdevfs.  Otherwise root_mount_onfail
 * decides: continue (return the last error), panic, reboot, or retry the
 * whole configuration from the start.
 */
static int
vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
{
	struct mount *mp;
	char *conf;
	int error;

	root_mount_mddev = -1;

retry:
	conf = sbuf_data(sb);
	mp = TAILQ_NEXT(mpdevfs, mnt_list);
	error = (mp == NULL) ? 0 : EDOOFUS;
	root_mount_onfail = A_CONTINUE;
	while (mp == NULL) {
		error = parse_skipto(&conf, CC_NONWHITESPACE);
		if (error == PE_EOL) {
			/* Blank line: step over the newline. */
			parse_advance(&conf);
			continue;
		}
		if (error < 0)
			break;
		switch (parse_peek(&conf)) {
		case '#':
			error = parse_skipto(&conf, '\n');
			break;
		case '.':
			error = parse_directive(&conf);
			break;
		default:
			error = parse_mount(&conf);
			if (error == -1) {
				printf("mountroot: invalid file system "
				    "specification.\n");
				error = 0;
			}
			break;
		}
		/* Only negative (parser) errors stop the scan. */
		if (error < 0)
			break;
		/* Ignore any trailing garbage on the line. */
		if (parse_peek(&conf) != '\n') {
			printf("mountroot: advancing to next directive...\n");
			(void)parse_skipto(&conf, '\n');
		}
		mp = TAILQ_NEXT(mpdevfs, mnt_list);
	}
	if (mp != NULL)
		return (0);

	/*
	 * We failed to mount (a new) root.
	 */
	switch (root_mount_onfail) {
	case A_CONTINUE:
		break;
	case A_PANIC:
		panic("mountroot: unable to (re-)mount root.");
		/* NOTREACHED */
	case A_RETRY:
		goto retry;
	case A_REBOOT:
		kern_reboot(RB_NOSYNC);
		/* NOTREACHED */
	}

	return (error);
}

static void
vfs_mountroot_conf0(struct sbuf *sb)
{
	char *s, *tok, *mnt, *opt;
	int error;

	sbuf_printf(sb, ".onfail panic\n");
	sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
	if (boothowto & RB_ASKNAME)
		sbuf_printf(sb, ".ask\n");
#ifdef ROOTDEVNAME
	if (boothowto & RB_DFLTROOT)
		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
#endif
	if (boothowto & RB_CDROM) {
		sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
		sbuf_printf(sb, ".timeout 0\n");
		sbuf_printf(sb, "cd9660:/dev/cd1 ro\n");
		sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
	}
	s = kern_getenv("vfs.root.mountfrom");
	if (s != NULL) {
		opt = kern_getenv("vfs.root.mountfrom.options");
		tok = s;
		error = parse_token(&tok, &mnt);
		while (!error) {
			sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ?
opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) sbuf_printf(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct timeval lastfail; int curfail; + TSENTER(); + curfail = 0; while (1) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&root_holds_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); } + + TSEXIT(); } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * specific device. Also do the wait if the user forced that * behaviour by setting vfs.root_mount_always_wait=1. 
*/ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0' || root_mount_always_wait != 0) { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * eg for geom_part(4) to finish tasting. */ DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it's that way * to behave exactly as it used to work before. */ vfs_mountroot_wait(); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; + + TSENTER(); td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. 
*/ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); + + TSEXIT(); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } if( strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so We need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); }