Index: head/sys/kern/kern_dump.c =================================================================== --- head/sys/kern/kern_dump.c (revision 298068) +++ head/sys/kern/kern_dump.c (revision 298069) @@ -1,399 +1,399 @@ /*- * Copyright (c) 2002 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_watchdog.h" #include #include #include #include #include #include #include #ifdef SW_WATCHDOG #include #endif #include #include #include #include #include #include #include CTASSERT(sizeof(struct kerneldumpheader) == 512); /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This * is to protect us from metadata and to protect metadata from us. */ #define SIZEOF_METADATA (64*1024) #define MD_ALIGN(x) (((off_t)(x) + PAGE_MASK) & ~PAGE_MASK) #define DEV_ALIGN(x) (((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1)) off_t dumplo; /* Handle buffered writes. */ static char buffer[DEV_BSIZE]; static size_t fragsz; struct dump_pa dump_map[DUMPSYS_MD_PA_NPAIRS]; #if !defined(__powerpc__) && !defined(__sparc__) void dumpsys_gen_pa_init(void) { int n, idx; bzero(dump_map, sizeof(dump_map)); for (n = 0; n < sizeof(dump_map) / sizeof(dump_map[0]); n++) { idx = n * 2; if (dump_avail[idx] == 0 && dump_avail[idx + 1] == 0) break; dump_map[n].pa_start = dump_avail[idx]; dump_map[n].pa_size = dump_avail[idx + 1] - dump_avail[idx]; } } #endif struct dump_pa * dumpsys_gen_pa_next(struct dump_pa *mdp) { if (mdp == NULL) return (&dump_map[0]); mdp++; if (mdp->pa_size == 0) mdp = NULL; return (mdp); } void dumpsys_gen_wbinv_all(void) { } void dumpsys_gen_unmap_chunk(vm_paddr_t pa __unused, size_t chunk __unused, void *va __unused) { } #if !defined(__sparc__) int dumpsys_gen_write_aux_headers(struct dumperinfo *di) { return (0); } #endif int dumpsys_buf_write(struct dumperinfo *di, char *ptr, size_t sz) { size_t len; int error; while (sz) { len = DEV_BSIZE - fragsz; if (len > sz) len = sz; bcopy(ptr, buffer + fragsz, len); fragsz += len; ptr += len; sz -= len; if (fragsz == DEV_BSIZE) { error = dump_write(di, buffer, 0, dumplo, DEV_BSIZE); if (error) return (error); dumplo += DEV_BSIZE; fragsz = 0; } } return (0); } int dumpsys_buf_flush(struct dumperinfo *di) { int error; if (fragsz == 0) return (0); error = dump_write(di, buffer, 0, dumplo, DEV_BSIZE); dumplo += DEV_BSIZE; fragsz = 0; return (error); } CTASSERT(PAGE_SHIFT < 20); #define PG2MB(pgs) ((pgs + (1 << (20 - PAGE_SHIFT)) - 1) >> (20 - PAGE_SHIFT)) int dumpsys_cb_dumpdata(struct dump_pa *mdp, int seqnr, void *arg) { struct dumperinfo *di = (struct dumperinfo*)arg; vm_paddr_t pa; void *va; uint64_t pgs; size_t counter, sz, chunk; int c, error; u_int maxdumppgs; error = 0; /* catch case in which chunk size is 0 */ counter = 0; /* Update twiddle every 16MB */ - va = 0; + va = NULL; pgs = mdp->pa_size / PAGE_SIZE; pa = mdp->pa_start; maxdumppgs = min(di->maxiosize / PAGE_SIZE, MAXDUMPPGS); if (maxdumppgs == 0) /* seatbelt */ maxdumppgs = 1; printf(" chunk %d: %juMB (%ju pages)", seqnr, (uintmax_t)PG2MB(pgs), (uintmax_t)pgs); dumpsys_wbinv_all(); while (pgs) { chunk = pgs; if (chunk > maxdumppgs) chunk = maxdumppgs; sz = chunk << PAGE_SHIFT; counter += sz; if (counter >> 24) { printf(" %ju", (uintmax_t)PG2MB(pgs)); counter &= (1 << 24) - 1; } dumpsys_map_chunk(pa, chunk, &va); #ifdef SW_WATCHDOG wdog_kern_pat(WD_LASTVAL); #endif error = dump_write(di, va, 0, dumplo, sz); dumpsys_unmap_chunk(pa, chunk, va); if (error) break; dumplo += sz; pgs -= chunk; pa += sz; /* Check for user abort. */ c = cncheckc(); if (c == 0x03) return (ECANCELED); if (c != -1) printf(" (CTRL-C to abort) "); } printf(" ... %s\n", (error) ? "fail" : "ok"); return (error); } int dumpsys_foreach_chunk(dumpsys_callback_t cb, void *arg) { struct dump_pa *mdp; int error, seqnr; seqnr = 0; mdp = dumpsys_pa_next(NULL); while (mdp != NULL) { error = (*cb)(mdp, seqnr++, arg); if (error) return (-error); mdp = dumpsys_pa_next(mdp); } return (seqnr); } #if !defined(__sparc__) static off_t fileofs; static int cb_dumphdr(struct dump_pa *mdp, int seqnr, void *arg) { struct dumperinfo *di = (struct dumperinfo*)arg; Elf_Phdr phdr; uint64_t size; int error; size = mdp->pa_size; bzero(&phdr, sizeof(phdr)); phdr.p_type = PT_LOAD; phdr.p_flags = PF_R; /* XXX */ phdr.p_offset = fileofs; #ifdef __powerpc__ phdr.p_vaddr = (do_minidump? mdp->pa_start : ~0L); phdr.p_paddr = (do_minidump? ~0L : mdp->pa_start); #else phdr.p_vaddr = mdp->pa_start; phdr.p_paddr = mdp->pa_start; #endif phdr.p_filesz = size; phdr.p_memsz = size; phdr.p_align = PAGE_SIZE; error = dumpsys_buf_write(di, (char*)&phdr, sizeof(phdr)); fileofs += phdr.p_filesz; return (error); } static int cb_size(struct dump_pa *mdp, int seqnr, void *arg) { uint64_t *sz; sz = (uint64_t *)arg; *sz += (uint64_t)mdp->pa_size; return (0); } int dumpsys_generic(struct dumperinfo *di) { static struct kerneldumpheader kdh; Elf_Ehdr ehdr; uint64_t dumpsize; off_t hdrgap; size_t hdrsz; int error; #ifndef __powerpc__ if (do_minidump) return (minidumpsys(di)); #endif bzero(&ehdr, sizeof(ehdr)); ehdr.e_ident[EI_MAG0] = ELFMAG0; ehdr.e_ident[EI_MAG1] = ELFMAG1; ehdr.e_ident[EI_MAG2] = ELFMAG2; ehdr.e_ident[EI_MAG3] = ELFMAG3; ehdr.e_ident[EI_CLASS] = ELF_CLASS; #if BYTE_ORDER == LITTLE_ENDIAN ehdr.e_ident[EI_DATA] = ELFDATA2LSB; #else ehdr.e_ident[EI_DATA] = ELFDATA2MSB; #endif ehdr.e_ident[EI_VERSION] = EV_CURRENT; ehdr.e_ident[EI_OSABI] = ELFOSABI_STANDALONE; /* XXX big picture? */ ehdr.e_type = ET_CORE; ehdr.e_machine = EM_VALUE; ehdr.e_phoff = sizeof(ehdr); ehdr.e_flags = 0; ehdr.e_ehsize = sizeof(ehdr); ehdr.e_phentsize = sizeof(Elf_Phdr); ehdr.e_shentsize = sizeof(Elf_Shdr); dumpsys_pa_init(); /* Calculate dump size. */ dumpsize = 0L; ehdr.e_phnum = dumpsys_foreach_chunk(cb_size, &dumpsize) + DUMPSYS_NUM_AUX_HDRS; hdrsz = ehdr.e_phoff + ehdr.e_phnum * ehdr.e_phentsize; fileofs = MD_ALIGN(hdrsz); dumpsize += fileofs; hdrgap = fileofs - DEV_ALIGN(hdrsz); /* Determine dump offset on device. */ if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) { error = ENOSPC; goto fail; } dumplo = di->mediaoffset + di->mediasize - dumpsize; dumplo -= sizeof(kdh) * 2; mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_ARCH_VERSION, dumpsize, di->blocksize); printf("Dumping %ju MB (%d chunks)\n", (uintmax_t)dumpsize >> 20, ehdr.e_phnum - DUMPSYS_NUM_AUX_HDRS); /* Dump leader */ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); if (error) goto fail; dumplo += sizeof(kdh); /* Dump ELF header */ error = dumpsys_buf_write(di, (char*)&ehdr, sizeof(ehdr)); if (error) goto fail; /* Dump program headers */ error = dumpsys_foreach_chunk(cb_dumphdr, di); if (error < 0) goto fail; error = dumpsys_write_aux_headers(di); if (error < 0) goto fail; dumpsys_buf_flush(di); /* * All headers are written using blocked I/O, so we know the * current offset is (still) block aligned. Skip the alignement * in the file to have the segment contents aligned at page * boundary. We cannot use MD_ALIGN on dumplo, because we don't * care and may very well be unaligned within the dump device. */ dumplo += hdrgap; /* Dump memory chunks (updates dumplo) */ error = dumpsys_foreach_chunk(dumpsys_cb_dumpdata, di); if (error < 0) goto fail; /* Dump trailer */ error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh)); if (error) goto fail; /* Signal completion, signoff and exit stage left. */ dump_write(di, NULL, 0, 0, 0); printf("\nDump complete\n"); return (0); fail: if (error < 0) error = -error; if (error == ECANCELED) printf("\nDump aborted\n"); else if (error == ENOSPC) printf("\nDump failed. Partition too small.\n"); else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); } #endif Index: head/sys/kern/kern_kthread.c =================================================================== --- head/sys/kern/kern_kthread.c (revision 298068) +++ head/sys/kern/kern_kthread.c (revision 298069) @@ -1,470 +1,470 @@ /*- * Copyright (c) 1999 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Start a kernel process. This is called after a fork() call in * mi_startup() in the file kern/init_main.c. * * This function is used to start "internal" daemons and intended * to be called from SYSINIT(). */ void kproc_start(const void *udata) { const struct kproc_desc *kp = udata; int error; error = kproc_create((void (*)(void *))kp->func, NULL, kp->global_procpp, 0, 0, "%s", kp->arg0); if (error) panic("kproc_start: %s: error %d", kp->arg0, error); } /* * Create a kernel process/thread/whatever. It shares its address space * with proc0 - ie: kernel only. * * func is the function to start. * arg is the parameter to pass to function on first startup. * newpp is the return value pointing to the thread's struct proc. * flags are flags to fork1 (in unistd.h) * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.). */ int kproc_create(void (*func)(void *), void *arg, struct proc **newpp, int flags, int pages, const char *fmt, ...) { struct fork_req fr; int error; va_list ap; struct thread *td; struct proc *p2; if (!proc0.p_stats) panic("kproc_create called too soon"); bzero(&fr, sizeof(fr)); fr.fr_flags = RFMEM | RFFDG | RFPROC | RFSTOPPED | flags; fr.fr_pages = pages; fr.fr_procp = &p2; error = fork1(&thread0, &fr); if (error) return error; /* save a global descriptor, if desired */ if (newpp != NULL) *newpp = p2; /* this is a non-swapped system process */ PROC_LOCK(p2); td = FIRST_THREAD_IN_PROC(p2); p2->p_flag |= P_SYSTEM | P_KPROC; td->td_pflags |= TDP_KTHREAD; mtx_lock(&p2->p_sigacts->ps_mtx); p2->p_sigacts->ps_flag |= PS_NOCLDWAIT; mtx_unlock(&p2->p_sigacts->ps_mtx); PROC_UNLOCK(p2); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap); va_end(ap); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap); va_end(ap); #ifdef KTR sched_clear_tdname(td); #endif /* call the processes' main()... */ cpu_set_fork_handler(td, func, arg); /* Avoid inheriting affinity from a random parent. */ cpuset_setthread(td->td_tid, cpuset_root); thread_lock(td); TD_SET_CAN_RUN(td); sched_prio(td, PVM); sched_user_prio(td, PUSER); /* Delay putting it on the run queue until now. */ if (!(flags & RFSTOPPED)) sched_add(td, SRQ_BORING); thread_unlock(td); return 0; } void kproc_exit(int ecode) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; /* * Reparent curthread from proc0 to init so that the zombie * is harvested. */ sx_xlock(&proctree_lock); PROC_LOCK(p); proc_reparent(p, initproc); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); /* * Wakeup anyone waiting for us to exit. */ wakeup(p); /* Buh-bye! */ exit1(td, ecode, 0); } /* * Advise a kernel process to suspend (or resume) in its main loop. * Participation is voluntary. */ int kproc_suspend(struct proc *p, int timo) { /* * Make sure this is indeed a system process and we can safely * use the p_siglist field. */ PROC_LOCK(p); if ((p->p_flag & P_KPROC) == 0) { PROC_UNLOCK(p); return (EINVAL); } SIGADDSET(p->p_siglist, SIGSTOP); wakeup(p); return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkp", timo); } int kproc_resume(struct proc *p) { /* * Make sure this is indeed a system process and we can safely * use the p_siglist field. */ PROC_LOCK(p); if ((p->p_flag & P_KPROC) == 0) { PROC_UNLOCK(p); return (EINVAL); } SIGDELSET(p->p_siglist, SIGSTOP); PROC_UNLOCK(p); wakeup(&p->p_siglist); return (0); } void kproc_suspend_check(struct proc *p) { PROC_LOCK(p); while (SIGISMEMBER(p->p_siglist, SIGSTOP)) { wakeup(&p->p_siglist); msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "kpsusp", 0); } PROC_UNLOCK(p); } /* * Start a kernel thread. * * This function is used to start "internal" daemons and intended * to be called from SYSINIT(). */ void kthread_start(const void *udata) { const struct kthread_desc *kp = udata; int error; error = kthread_add((void (*)(void *))kp->func, NULL, NULL, kp->global_threadpp, 0, 0, "%s", kp->arg0); if (error) panic("kthread_start: %s: error %d", kp->arg0, error); } /* * Create a kernel thread. It shares its address space * with proc0 - ie: kernel only. * * func is the function to start. * arg is the parameter to pass to function on first startup. * newtdp is the return value pointing to the thread's struct thread. * ** XXX fix this --> flags are flags to fork1 (in unistd.h) * fmt and following will be *printf'd into (*newtd)->td_name (for ps, etc.). */ int kthread_add(void (*func)(void *), void *arg, struct proc *p, struct thread **newtdp, int flags, int pages, const char *fmt, ...) { va_list ap; struct thread *newtd, *oldtd; if (!proc0.p_stats) panic("kthread_add called too soon"); /* If no process supplied, put it on proc0 */ if (p == NULL) p = &proc0; /* Initialize our new td */ newtd = thread_alloc(pages); if (newtd == NULL) return (ENOMEM); PROC_LOCK(p); oldtd = FIRST_THREAD_IN_PROC(p); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&oldtd->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(newtd->td_name, sizeof(newtd->td_name), fmt, ap); va_end(ap); newtd->td_proc = p; /* needed for cpu_set_upcall */ /* XXX optimise this probably? */ /* On x86 (and probably the others too) it is way too full of junk */ /* Needs a better name */ cpu_set_upcall(newtd, oldtd); /* put the designated function(arg) as the resume context */ cpu_set_fork_handler(newtd, func, arg); newtd->td_pflags |= TDP_KTHREAD; thread_cow_get_proc(newtd, p); /* this code almost the same as create_thread() in kern_thr.c */ p->p_flag |= P_HADTHREADS; thread_link(newtd, p); thread_lock(oldtd); /* let the scheduler know about these things. */ sched_fork_thread(oldtd, newtd); TD_SET_CAN_RUN(newtd); thread_unlock(oldtd); PROC_UNLOCK(p); tidhash_add(newtd); /* Avoid inheriting affinity from a random parent. */ cpuset_setthread(newtd->td_tid, cpuset_root); /* Delay putting it on the run queue until now. */ if (!(flags & RFSTOPPED)) { thread_lock(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); } if (newtdp) *newtdp = newtd; return 0; } void kthread_exit(void) { struct proc *p; p = curthread->td_proc; /* A module may be waiting for us to exit. */ wakeup(curthread); /* * The last exiting thread in a kernel process must tear down * the whole process. */ rw_wlock(&tidhash_lock); PROC_LOCK(p); if (p->p_numthreads == 1) { PROC_UNLOCK(p); rw_wunlock(&tidhash_lock); kproc_exit(0); } LIST_REMOVE(curthread, td_hash); rw_wunlock(&tidhash_lock); umtx_thread_exit(curthread); PROC_SLOCK(p); thread_exit(); } /* * Advise a kernel process to suspend (or resume) in its main loop. * Participation is voluntary. */ int kthread_suspend(struct thread *td, int timo) { struct proc *p; p = td->td_proc; /* * td_pflags should not be read by any thread other than * curthread, but as long as this flag is invariant during the * thread's lifetime, it is OK to check its state. */ if ((td->td_pflags & TDP_KTHREAD) == 0) return (EINVAL); /* * The caller of the primitive should have already checked that the * thread is up and running, thus not being blocked by other * conditions. */ PROC_LOCK(p); thread_lock(td); td->td_flags |= TDF_KTH_SUSP; thread_unlock(td); return (msleep(&td->td_flags, &p->p_mtx, PPAUSE | PDROP, "suspkt", timo)); } /* * Resume a thread previously put asleep with kthread_suspend(). */ int kthread_resume(struct thread *td) { struct proc *p; p = td->td_proc; /* * td_pflags should not be read by any thread other than * curthread, but as long as this flag is invariant during the * thread's lifetime, it is OK to check its state. */ if ((td->td_pflags & TDP_KTHREAD) == 0) return (EINVAL); PROC_LOCK(p); thread_lock(td); td->td_flags &= ~TDF_KTH_SUSP; thread_unlock(td); wakeup(&td->td_flags); PROC_UNLOCK(p); return (0); } /* * Used by the thread to poll as to whether it should yield/sleep * and notify the caller that is has happened. */ void kthread_suspend_check(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; if ((td->td_pflags & TDP_KTHREAD) == 0) panic("%s: curthread is not a valid kthread", __func__); /* * As long as the double-lock protection is used when accessing the * TDF_KTH_SUSP flag, synchronizing the read operation via proc mutex * is fine. */ PROC_LOCK(p); while (td->td_flags & TDF_KTH_SUSP) { wakeup(&td->td_flags); msleep(&td->td_flags, &p->p_mtx, PPAUSE, "ktsusp", 0); } PROC_UNLOCK(p); } int kproc_kthread_add(void (*func)(void *), void *arg, struct proc **procptr, struct thread **tdptr, int flags, int pages, const char *procname, const char *fmt, ...) { int error; va_list ap; char buf[100]; struct thread *td; - if (*procptr == 0) { + if (*procptr == NULL) { error = kproc_create(func, arg, procptr, flags, pages, "%s", procname); if (error) return (error); td = FIRST_THREAD_IN_PROC(*procptr); if (tdptr) *tdptr = td; va_start(ap, fmt); vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap); va_end(ap); #ifdef KTR sched_clear_tdname(td); #endif return (0); } va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); error = kthread_add(func, arg, *procptr, tdptr, flags, pages, "%s", buf); return (error); } Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 298068) +++ head/sys/kern/kern_proc.c (revision 298069) @@ -1,3115 +1,3115 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #endif SDT_PROVIDER_DEFINE(proc); SDT_PROBE_DEFINE4(proc, , ctor, entry, "struct proc *", "int", "void *", "int"); SDT_PROBE_DEFINE4(proc, , ctor, return, "struct proc *", "int", "void *", "int"); SDT_PROBE_DEFINE4(proc, , dtor, entry, "struct proc *", "int", "void *", "struct thread *"); SDT_PROBE_DEFINE3(proc, , dtor, return, "struct proc *", "int", "void *"); SDT_PROBE_DEFINE3(proc, , init, entry, "struct proc *", "int", "int"); SDT_PROBE_DEFINE3(proc, , init, return, "struct proc *", "int", "int"); MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); static void doenterpgrp(struct proc *, struct pgrp *); static void orphanpg(struct pgrp *pg); static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread); static void pgadjustjobc(struct pgrp *pgrp, int entering); static void pgdelete(struct pgrp *); static int proc_ctor(void *mem, int size, void *arg, int flags); static void proc_dtor(void *mem, int size, void *arg); static int proc_init(void *mem, int size, int flags); static void proc_fini(void *mem, int size); static void pargs_free(struct pargs *pa); static struct proc *zpfind_locked(pid_t pid); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct sx allproc_lock; struct sx proctree_lock; struct mtx ppeers_lock; uma_zone_t proc_zone; /* * The offset of various fields in struct proc and struct thread. * These are used by kernel debuggers to enumerate kernel threads and * processes. */ const int proc_off_p_pid = offsetof(struct proc, p_pid); const int proc_off_p_comm = offsetof(struct proc, p_comm); const int proc_off_p_list = offsetof(struct proc, p_list); const int proc_off_p_threads = offsetof(struct proc, p_threads); const int thread_off_td_tid = offsetof(struct thread, td_tid); const int thread_off_td_name = offsetof(struct thread, td_name); const int thread_off_td_oncpu = offsetof(struct thread, td_oncpu); const int thread_off_td_pcb = offsetof(struct thread, td_pcb); const int thread_off_td_plist = offsetof(struct thread, td_plist); int kstack_pages = KSTACK_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "Kernel stack size in pages"); static int vmmap_skip_res_cnt = 0; SYSCTL_INT(_kern, OID_AUTO, proc_vmmap_skip_resident_count, CTLFLAG_RW, &vmmap_skip_res_cnt, 0, "Skip calculation of the pages resident count in kern.proc.vmmap"); CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #ifdef COMPAT_FREEBSD32 CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE); #endif /* * Initialize global process hashing structures. */ void procinit() { sx_init(&allproc_lock, "allproc"); sx_init(&proctree_lock, "proctree"); mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uihashinit(); } /* * Prepare a proc for use. */ static int proc_ctor(void *mem, int size, void *arg, int flags) { struct proc *p; p = (struct proc *)mem; SDT_PROBE4(proc, , ctor , entry, p, size, arg, flags); EVENTHANDLER_INVOKE(process_ctor, p); SDT_PROBE4(proc, , ctor , return, p, size, arg, flags); return (0); } /* * Reclaim a proc after use. */ static void proc_dtor(void *mem, int size, void *arg) { struct proc *p; struct thread *td; /* INVARIANTS checks go here */ p = (struct proc *)mem; td = FIRST_THREAD_IN_PROC(p); SDT_PROBE4(proc, , dtor, entry, p, size, arg, td); if (td != NULL) { #ifdef INVARIANTS KASSERT((p->p_numthreads == 1), ("bad number of threads in exiting process")); KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); } EVENTHANDLER_INVOKE(process_dtor, p); if (p->p_ksi != NULL) KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue")); SDT_PROBE3(proc, , dtor, return, p, size, arg); } /* * Initialize type-stable parts of a proc (when newly created). */ static int proc_init(void *mem, int size, int flags) { struct proc *p; p = (struct proc *)mem; SDT_PROBE3(proc, , init, entry, p, size, flags); p->p_sched = (struct p_sched *)&p[1]; mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW); mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN | MTX_NEW); mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN | MTX_NEW); cv_init(&p->p_pwait, "ppwait"); cv_init(&p->p_dbgwait, "dbgwait"); TAILQ_INIT(&p->p_threads); /* all threads in proc */ EVENTHANDLER_INVOKE(process_init, p); p->p_stats = pstats_alloc(); p->p_pgrp = NULL; SDT_PROBE3(proc, , init, return, p, size, flags); return (0); } /* * UMA should ensure that this function is never called. * Freeing a proc structure would violate type stability. */ static void proc_fini(void *mem, int size) { #ifdef notnow struct proc *p; p = (struct proc *)mem; EVENTHANDLER_INVOKE(process_fini, p); pstats_free(p->p_stats); thread_free(FIRST_THREAD_IN_PROC(p)); mtx_destroy(&p->p_mtx); if (p->p_ksi != NULL) ksiginfo_free(p->p_ksi); #else panic("proc reclaimed"); #endif } /* * Is p an inferior of the current process? */ int inferior(struct proc *p) { sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); for (; p != curproc; p = proc_realparent(p)) { if (p->p_pid == 0) return (0); } return (1); } struct proc * pfind_locked(pid_t pid) { struct proc *p; sx_assert(&allproc_lock, SX_LOCKED); LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); p = NULL; } break; } } return (p); } /* * Locate a process by number; return only "live" processes -- i.e., neither * zombies nor newly born but incompletely initialized processes. By not * returning processes in the PRS_NEW state, we allow callers to avoid * testing for that condition to avoid dereferencing p_ucred, et al. */ struct proc * pfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); p = pfind_locked(pid); sx_sunlock(&allproc_lock); return (p); } static struct proc * pfind_tid_locked(pid_t tid) { struct proc *p; struct thread *td; sx_assert(&allproc_lock, SX_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) goto found; } PROC_UNLOCK(p); } found: return (p); } /* * Locate a process group by number. * The caller must hold proctree_lock. */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { PGRP_LOCK(pgrp); return (pgrp); } } return (NULL); } /* * Locate process and do additional manipulations, depending on flags. */ int pget(pid_t pid, int flags, struct proc **pp) { struct proc *p; int error; sx_slock(&allproc_lock); if (pid <= PID_MAX) { p = pfind_locked(pid); if (p == NULL && (flags & PGET_NOTWEXIT) == 0) p = zpfind_locked(pid); } else if ((flags & PGET_NOTID) == 0) { p = pfind_tid_locked(pid); } else { p = NULL; } sx_sunlock(&allproc_lock); if (p == NULL) return (ESRCH); if ((flags & PGET_CANSEE) != 0) { error = p_cansee(curthread, p); if (error != 0) goto errout; } if ((flags & PGET_CANDEBUG) != 0) { error = p_candebug(curthread, p); if (error != 0) goto errout; } if ((flags & PGET_ISCURRENT) != 0 && curproc != p) { error = EPERM; goto errout; } if ((flags & PGET_NOTWEXIT) != 0 && (p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto errout; } if ((flags & PGET_NOTINEXEC) != 0 && (p->p_flag & P_INEXEC) != 0) { /* * XXXRW: Not clear ESRCH is the right error during proc * execve(). */ error = ESRCH; goto errout; } if ((flags & PGET_HOLD) != 0) { _PHOLD(p); PROC_UNLOCK(p); } *pp = p; return (0); errout: PROC_UNLOCK(p); return (error); } /* * Create a new process group. * pgid must be equal to the pid of p. * Begin a new session if required. */ int enterpgrp(p, pgid, pgrp, sess) register struct proc *p; pid_t pgid; struct pgrp *pgrp; struct session *sess; { sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); KASSERT(pgfind(pgid) == NULL, ("enterpgrp: pgrp with pgid exists")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); if (sess != NULL) { /* * new session */ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); PROC_LOCK(p); p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); PGRP_LOCK(pgrp); sess->s_leader = p; sess->s_sid = p->p_pid; refcount_init(&sess->s_count, 1); sess->s_ttyvp = NULL; sess->s_ttydp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { pgrp->pg_session = p->p_session; sess_hold(pgrp->pg_session); PGRP_LOCK(pgrp); } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); /* * As we have an exclusive lock of proctree_lock, * this should not deadlock. */ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); doenterpgrp(p, pgrp); return (0); } /* * Move p to an existing process group */ int enterthispgrp(p, pgrp) register struct proc *p; struct pgrp *pgrp; { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); KASSERT(pgrp->pg_session == p->p_session, ("%s: pgrp's session %p, p->p_session %p.\n", __func__, pgrp->pg_session, p->p_session)); KASSERT(pgrp != p->p_pgrp, ("%s: p belongs to pgrp.", __func__)); doenterpgrp(p, pgrp); return (0); } /* * Move p to a process group */ static void doenterpgrp(p, pgrp) struct proc *p; struct pgrp *pgrp; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); savepgrp = p->p_pgrp; /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); PGRP_LOCK(pgrp); PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = pgrp; PROC_UNLOCK(p); LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); PGRP_UNLOCK(savepgrp); PGRP_UNLOCK(pgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; PROC_UNLOCK(p); PGRP_UNLOCK(savepgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { struct session *savesess; struct tty *tp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); PGRP_LOCK(pgrp); tp = pgrp->pg_session->s_ttyp; LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; PGRP_UNLOCK(pgrp); /* Remove the reference to the pgrp before deallocating it. */ if (tp != NULL) { tty_lock(tp); tty_rel_pgrp(tp, pgrp); } mtx_destroy(&pgrp->pg_mtx); free(pgrp, M_PGRP); sess_release(savesess); } static void pgadjustjobc(pgrp, entering) struct pgrp *pgrp; int entering; { PGRP_LOCK(pgrp); if (entering) pgrp->pg_jobc++; else { --pgrp->pg_jobc; if (pgrp->pg_jobc == 0) orphanpg(pgrp); } PGRP_UNLOCK(pgrp); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(struct proc *p, struct pgrp *pgrp, int entering) { struct pgrp *hispgrp; struct session *mysession; struct proc *q; sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ mysession = pgrp->pg_session; if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) pgadjustjobc(pgrp, entering); /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(q, &p->p_children, p_sibling) { hispgrp = q->p_pgrp; if (hispgrp == pgrp || hispgrp->pg_session != mysession) continue; if (q->p_state == PRS_ZOMBIE) continue; pgadjustjobc(hispgrp, entering); } } void killjobc(void) { struct session *sp; struct tty *tp; struct proc *p; struct vnode *ttyvp; p = curproc; MPASS(p->p_flag & P_WEXIT); /* * Do a quick check to see if there is anything to do with the * proctree_lock held. pgrp and LIST_EMPTY checks are for fixjobc(). */ PROC_LOCK(p); if (!SESS_LEADER(p) && (p->p_pgrp == p->p_pptr->p_pgrp) && LIST_EMPTY(&p->p_children)) { PROC_UNLOCK(p); return; } PROC_UNLOCK(p); sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { sp = p->p_session; /* * s_ttyp is not zero'd; we use this to indicate that * the session once had a controlling terminal. (for * logging and informational purposes) */ SESS_LOCK(sp); ttyvp = sp->s_ttyvp; tp = sp->s_ttyp; sp->s_ttyvp = NULL; sp->s_ttydp = NULL; sp->s_leader = NULL; SESS_UNLOCK(sp); /* * Signal foreground pgrp and revoke access to * controlling terminal if it has not been revoked * already. * * Because the TTY may have been revoked in the mean * time and could already have a new session associated * with it, make sure we don't send a SIGHUP to a * foreground process group that does not belong to this * session. */ if (tp != NULL) { tty_lock(tp); if (tp->t_session == sp) tty_signal_pgrp(tp, SIGHUP); tty_unlock(tp); } if (ttyvp != NULL) { sx_xunlock(&proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } vrele(ttyvp); sx_xlock(&proctree_lock); } } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. */ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; PGRP_LOCK_ASSERT(pg, MA_OWNED); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (P_SHOULDSTOP(p) == P_STOPPED_SIG) { PROC_UNLOCK(p); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); kern_psignal(p, SIGHUP); kern_psignal(p, SIGCONT); PROC_UNLOCK(p); } return; } PROC_UNLOCK(p); } } void sess_hold(struct session *s) { refcount_acquire(&s->s_count); } void sess_release(struct session *s) { if (refcount_release(&s->s_count)) { if (s->s_ttyp != NULL) { tty_lock(s->s_ttyp); tty_rel_sess(s->s_ttyp, s); } mtx_destroy(&s->s_mtx); free(s, M_SESSION); } } #ifdef DDB DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ /* * Calculate the kinfo_proc members which contain process-wide * informations. * Must be called with the target process locked. */ static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_estcpu = 0; kp->ki_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); kp->ki_pctcpu += sched_pctcpu(td); kp->ki_estcpu += td->td_estcpu; thread_unlock(td); } } /* * Clear kinfo_proc and fill in any information that is common * to all threads in the process. * Must be called with the target process locked. */ static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) { struct thread *td0; struct tty *tp; struct session *sp; struct ucred *cred; struct sigacts *ps; /* For proc_realparent. */ sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; kp->ki_addr =/* p->p_addr; */0; /* XXX */ kp->ki_args = p->p_args; kp->ki_textvp = p->p_textvp; #ifdef KTRACE kp->ki_tracep = p->p_tracevp; kp->ki_traceflag = p->p_traceflag; #endif kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; kp->ki_flag = p->p_flag; kp->ki_flag2 = p->p_flag2; cred = p->p_ucred; if (cred) { kp->ki_uid = cred->cr_uid; kp->ki_ruid = cred->cr_ruid; kp->ki_svuid = cred->cr_svuid; kp->ki_cr_flags = 0; if (cred->cr_flags & CRED_FLAG_CAPMODE) kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE; /* XXX bde doesn't like KI_NGROUPS */ if (cred->cr_ngroups > KI_NGROUPS) { kp->ki_ngroups = KI_NGROUPS; kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW; } else kp->ki_ngroups = cred->cr_ngroups; bcopy(cred->cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_rgid = cred->cr_rgid; kp->ki_svgid = cred->cr_svgid; /* If jailed(cred), emulate the old P_JAILED flag. */ if (jailed(cred)) { kp->ki_flag |= P_JAILED; /* If inside the jail, use 0 as a jail ID. */ if (cred->cr_prison != curthread->td_ucred->cr_prison) kp->ki_jid = cred->cr_prison->pr_id; } strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name, sizeof(kp->ki_loginclass)); } ps = p->p_sigacts; if (ps) { mtx_lock(&ps->ps_mtx); kp->ki_sigignore = ps->ps_sigignore; kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ FOREACH_THREAD_IN_PROC(p, td0) { if (!TD_IS_SWAPPED(td0)) kp->ki_rssize += td0->td_kstack_pages; } kp->ki_swrss = vm->vm_swrss; kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } else if (p->p_state == PRS_ZOMBIE) kp->ki_stat = SZOMB; if (kp->ki_flag & P_INMEM) kp->ki_sflag = PS_INMEM; else kp->ki_sflag = 0; /* Calculate legacy swtime as seconds since 'swtick'. */ kp->ki_swtime = (ticks - p->p_swtick) / hz; kp->ki_pid = p->p_pid; kp->ki_nice = p->p_nice; kp->ki_fibnum = p->p_fibnum; kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); PROC_STATLOCK(p); rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime); PROC_STATUNLOCK(p); calccru(p, &kp->ki_childutime, &kp->ki_childstime); /* Some callers want child times in a single value. */ kp->ki_childtime = kp->ki_childstime; timevaladd(&kp->ki_childtime, &kp->ki_childutime); FOREACH_THREAD_IN_PROC(p, td0) kp->ki_cow += td0->td_cow; tp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; SESS_LOCK(sp); strlcpy(kp->ki_login, sp->s_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; /* XXX proctree_lock */ tp = sp->s_ttyp; SESS_UNLOCK(sp); } } if ((p->p_flag & P_CONTROLT) && tp != NULL) { kp->ki_tdev = tty_udev(tp); kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NODEV; if (p->p_comm[0] != '\0') strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm)); if (p->p_sysent && p->p_sysent->sv_name != NULL && p->p_sysent->sv_name[0] != '\0') strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul)); kp->ki_siglist = p->p_siglist; kp->ki_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); kp->ki_acflag = p->p_acflag; kp->ki_lock = p->p_lock; if (p->p_pptr) { kp->ki_ppid = proc_realparent(p)->p_pid; if (p->p_flag & P_TRACED) kp->ki_tracer = p->p_pptr->p_pid; } } /* * Fill in information that is thread specific. Must be called with * target process locked. If 'preferthread' is set, overwrite certain * process-related fields that are maintained for both threads and * processes. */ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread) { struct proc *p; p = td->td_proc; kp->ki_tdaddr = td; PROC_LOCK_ASSERT(p, MA_OWNED); if (preferthread) PROC_STATLOCK(p); thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg)); strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname)); if (TD_ON_LOCK(td)) { kp->ki_kiflag |= KI_LOCKBLOCK; strlcpy(kp->ki_lockname, td->td_lockname, sizeof(kp->ki_lockname)); } else { kp->ki_kiflag &= ~KI_LOCKBLOCK; bzero(kp->ki_lockname, sizeof(kp->ki_lockname)); } if (p->p_state == PRS_NORMAL) { /* approximate. */ if (TD_ON_RUNQ(td) || TD_CAN_RUN(td) || TD_IS_RUNNING(td)) { kp->ki_stat = SRUN; } else if (P_SHOULDSTOP(p)) { kp->ki_stat = SSTOP; } else if (TD_IS_SLEEPING(td)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(td)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } else if (p->p_state == PRS_ZOMBIE) { kp->ki_stat = SZOMB; } else { kp->ki_stat = SIDL; } /* Things in the thread */ kp->ki_wchan = td->td_wchan; kp->ki_pri.pri_level = td->td_priority; kp->ki_pri.pri_native = td->td_base_pri; /* * Note: legacy fields; clamp at the old NOCPU value and/or * the maximum u_char CPU value. */ if (td->td_lastcpu == NOCPU) kp->ki_lastcpu_old = NOCPU_OLD; else if (td->td_lastcpu > MAXCPU_OLD) kp->ki_lastcpu_old = MAXCPU_OLD; else kp->ki_lastcpu_old = td->td_lastcpu; if (td->td_oncpu == NOCPU) kp->ki_oncpu_old = NOCPU_OLD; else if (td->td_oncpu > MAXCPU_OLD) kp->ki_oncpu_old = MAXCPU_OLD; else kp->ki_oncpu_old = td->td_oncpu; kp->ki_lastcpu = td->td_lastcpu; kp->ki_oncpu = td->td_oncpu; kp->ki_tdflags = td->td_flags; kp->ki_tid = td->td_tid; kp->ki_numthreads = p->p_numthreads; kp->ki_pcb = td->td_pcb; kp->ki_kstack = (void *)td->td_kstack; kp->ki_slptime = (ticks - td->td_slptick) / hz; kp->ki_pri.pri_class = td->td_pri_class; kp->ki_pri.pri_user = td->td_user_pri; if (preferthread) { rufetchtd(td, &kp->ki_rusage); kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime); kp->ki_pctcpu = sched_pctcpu(td); kp->ki_estcpu = td->td_estcpu; kp->ki_cow = td->td_cow; } /* We can't get this anymore but ps etc never used it anyway. */ kp->ki_rqindex = 0; if (preferthread) kp->ki_siglist = td->td_siglist; kp->ki_sigmask = td->td_sigmask; thread_unlock(td); if (preferthread) PROC_STATUNLOCK(p); } /* * Fill in a kinfo_proc structure for the specified process. * Must be called with the target process locked. */ void fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) { MPASS(FIRST_THREAD_IN_PROC(p) != NULL); fill_kinfo_proc_only(p, kp); fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0); fill_kinfo_aggregate(p, kp); } struct pstats * pstats_alloc(void) { return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK)); } /* * Copy parts of p_stats; zero the rest of p_stats (statistics). */ void pstats_fork(struct pstats *src, struct pstats *dst) { bzero(&dst->pstat_startzero, __rangeof(struct pstats, pstat_startzero, pstat_endzero)); bcopy(&src->pstat_startcopy, &dst->pstat_startcopy, __rangeof(struct pstats, pstat_startcopy, pstat_endcopy)); } void pstats_free(struct pstats *ps) { free(ps, M_SUBPROC); } static struct proc * zpfind_locked(pid_t pid) { struct proc *p; sx_assert(&allproc_lock, SX_LOCKED); LIST_FOREACH(p, &zombproc, p_list) { if (p->p_pid == pid) { PROC_LOCK(p); break; } } return (p); } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); p = zpfind_locked(pid); sx_sunlock(&allproc_lock); return (p); } #ifdef COMPAT_FREEBSD32 /* * This function is typically used to copy out the kernel address, so * it can be replaced by assignment of zero. */ static inline uint32_t ptr32_trim(void *ptr) { uintptr_t uptr; uptr = (uintptr_t)ptr; return ((uptr > UINT_MAX) ? 0 : uptr); } #define PTRTRIM_CP(src,dst,fld) \ do { (dst).fld = ptr32_trim((src).fld); } while (0) static void freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32) { int i; bzero(ki32, sizeof(struct kinfo_proc32)); ki32->ki_structsize = sizeof(struct kinfo_proc32); CP(*ki, *ki32, ki_layout); PTRTRIM_CP(*ki, *ki32, ki_args); PTRTRIM_CP(*ki, *ki32, ki_paddr); PTRTRIM_CP(*ki, *ki32, ki_addr); PTRTRIM_CP(*ki, *ki32, ki_tracep); PTRTRIM_CP(*ki, *ki32, ki_textvp); PTRTRIM_CP(*ki, *ki32, ki_fd); PTRTRIM_CP(*ki, *ki32, ki_vmspace); PTRTRIM_CP(*ki, *ki32, ki_wchan); CP(*ki, *ki32, ki_pid); CP(*ki, *ki32, ki_ppid); CP(*ki, *ki32, ki_pgid); CP(*ki, *ki32, ki_tpgid); CP(*ki, *ki32, ki_sid); CP(*ki, *ki32, ki_tsid); CP(*ki, *ki32, ki_jobc); CP(*ki, *ki32, ki_tdev); CP(*ki, *ki32, ki_siglist); CP(*ki, *ki32, ki_sigmask); CP(*ki, *ki32, ki_sigignore); CP(*ki, *ki32, ki_sigcatch); CP(*ki, *ki32, ki_uid); CP(*ki, *ki32, ki_ruid); CP(*ki, *ki32, ki_svuid); CP(*ki, *ki32, ki_rgid); CP(*ki, *ki32, ki_svgid); CP(*ki, *ki32, ki_ngroups); for (i = 0; i < KI_NGROUPS; i++) CP(*ki, *ki32, ki_groups[i]); CP(*ki, *ki32, ki_size); CP(*ki, *ki32, ki_rssize); CP(*ki, *ki32, ki_swrss); CP(*ki, *ki32, ki_tsize); CP(*ki, *ki32, ki_dsize); CP(*ki, *ki32, ki_ssize); CP(*ki, *ki32, ki_xstat); CP(*ki, *ki32, ki_acflag); CP(*ki, *ki32, ki_pctcpu); CP(*ki, *ki32, ki_estcpu); CP(*ki, *ki32, ki_slptime); CP(*ki, *ki32, ki_swtime); CP(*ki, *ki32, ki_cow); CP(*ki, *ki32, ki_runtime); TV_CP(*ki, *ki32, ki_start); TV_CP(*ki, *ki32, ki_childtime); CP(*ki, *ki32, ki_flag); CP(*ki, *ki32, ki_kiflag); CP(*ki, *ki32, ki_traceflag); CP(*ki, *ki32, ki_stat); CP(*ki, *ki32, ki_nice); CP(*ki, *ki32, ki_lock); CP(*ki, *ki32, ki_rqindex); CP(*ki, *ki32, ki_oncpu); CP(*ki, *ki32, ki_lastcpu); /* XXX TODO: wrap cpu value as appropriate */ CP(*ki, *ki32, ki_oncpu_old); CP(*ki, *ki32, ki_lastcpu_old); bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1); bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1); bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1); bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1); bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1); bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1); bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1); CP(*ki, *ki32, ki_tracer); CP(*ki, *ki32, ki_flag2); CP(*ki, *ki32, ki_fibnum); CP(*ki, *ki32, ki_cr_flags); CP(*ki, *ki32, ki_jid); CP(*ki, *ki32, ki_numthreads); CP(*ki, *ki32, ki_tid); CP(*ki, *ki32, ki_pri); freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage); freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch); PTRTRIM_CP(*ki, *ki32, ki_pcb); PTRTRIM_CP(*ki, *ki32, ki_kstack); PTRTRIM_CP(*ki, *ki32, ki_udata); CP(*ki, *ki32, ki_sflag); CP(*ki, *ki32, ki_tdflags); } #endif int kern_proc_out(struct proc *p, struct sbuf *sb, int flags) { struct thread *td; struct kinfo_proc ki; #ifdef COMPAT_FREEBSD32 struct kinfo_proc32 ki32; #endif int error; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(FIRST_THREAD_IN_PROC(p) != NULL); error = 0; fill_kinfo_proc(p, &ki); if ((flags & KERN_PROC_NOTHREADS) != 0) { #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; } else { FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &ki, 1); #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; if (error != 0) break; } } PROC_UNLOCK(p); return (error); } static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags, int doingzomb) { struct sbuf sb; struct kinfo_proc ki; struct proc *np; int error, error2; pid_t pid; pid = p->p_pid; sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = kern_proc_out(p, &sb, flags); error2 = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0) return (error); else if (error2 != 0) return (error2); if (doingzomb) np = zpfind(pid); else { if (pid == 0) return (0); np = pfind(pid); } if (np == NULL) return (ESRCH); if (np != p) { PROC_UNLOCK(np); return (ESRCH); } PROC_UNLOCK(np); return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, doingzomb, oid_number; int error = 0; oid_number = oidp->oid_number; if (oid_number != KERN_PROC_ALL && (oid_number & KERN_PROC_INC_THREAD) == 0) flags = KERN_PROC_NOTHREADS; else { flags = 0; oid_number &= ~KERN_PROC_INC_THREAD; } #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) flags |= KERN_PROC_MASK32; #endif if (oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); sx_slock(&proctree_lock); error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error == 0) error = sysctl_out_proc(p, req, flags, 0); sx_sunlock(&proctree_lock); return (error); } switch (oid_number) { case KERN_PROC_ALL: if (namelen != 0) return (EINVAL); break; case KERN_PROC_PROC: if (namelen != 0 && namelen != 1) return (EINVAL); break; default: if (namelen != 1) return (EINVAL); break; } if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_slock(&proctree_lock); sx_slock(&allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); - for (; p != 0; p = LIST_NEXT(p, p_list)) { + for (; p != NULL; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. */ PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); /* * Show a user only appropriate processes. */ if (p_cansee(curthread, p)) { PROC_UNLOCK(p); continue; } /* * TODO - make more efficient (see notes below). * do by session. */ switch (oid_number) { case KERN_PROC_GID: if (p->p_ucred->cr_gid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RGID: if (p->p_ucred->cr_rgid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_SESSION: if (p->p_session == NULL || p->p_session->s_sid != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL) { PROC_UNLOCK(p); continue; } /* XXX proctree_lock */ SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || tty_udev(p->p_session->s_ttyp) != (dev_t)name[0]) { SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); continue; } SESS_UNLOCK(p->p_session); break; case KERN_PROC_UID: if (p->p_ucred->cr_uid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RUID: if (p->p_ucred->cr_ruid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PROC: break; default: break; } error = sysctl_out_proc(p, req, flags, doingzomb); if (error) { sx_sunlock(&allproc_lock); sx_sunlock(&proctree_lock); return (error); } } } sx_sunlock(&allproc_lock); sx_sunlock(&proctree_lock); return (0); } struct pargs * pargs_alloc(int len) { struct pargs *pa; pa = malloc(sizeof(struct pargs) + len, M_PARGS, M_WAITOK); refcount_init(&pa->ar_ref, 1); pa->ar_length = len; return (pa); } static void pargs_free(struct pargs *pa) { free(pa, M_PARGS); } void pargs_hold(struct pargs *pa) { if (pa == NULL) return; refcount_acquire(&pa->ar_ref); } void pargs_drop(struct pargs *pa) { if (pa == NULL) return; if (refcount_release(&pa->ar_ref)) pargs_free(pa); } static int proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf, size_t len) { ssize_t n; /* * This may return a short read if the string is shorter than the chunk * and is aligned at the end of the page, and the following page is not * mapped. */ n = proc_readmem(td, p, (vm_offset_t)sptr, buf, len); if (n <= 0) return (ENOMEM); return (0); } #define PROC_AUXV_MAX 256 /* Safety limit on auxv size. */ enum proc_vector_type { PROC_ARG, PROC_ENV, PROC_AUX, }; #ifdef COMPAT_FREEBSD32 static int get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct freebsd32_ps_strings pss; Elf32_Auxinfo aux; vm_offset_t vptr, ptr; uint32_t *proc_vector32; char **proc_vector; size_t vsize, size; int i, error; error = 0; if (proc_readmem(td, p, (vm_offset_t)p->p_sysent->sv_psstrings, &pss, sizeof(pss)) != sizeof(pss)) return (ENOMEM); switch (type) { case PROC_ARG: vptr = (vm_offset_t)PTRIN(pss.ps_argvstr); vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_ENV: vptr = (vm_offset_t)PTRIN(pss.ps_envstr); vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_AUX: vptr = (vm_offset_t)PTRIN(pss.ps_envstr) + (pss.ps_nenvstr + 1) * sizeof(int32_t); if (vptr % 4 != 0) return (ENOEXEC); for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { if (proc_readmem(td, p, ptr, &aux, sizeof(aux)) != sizeof(aux)) return (ENOMEM); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); } proc_vector32 = malloc(size, M_TEMP, M_WAITOK); if (proc_readmem(td, p, vptr, proc_vector32, size) != size) { error = ENOMEM; goto done; } if (type == PROC_AUX) { *proc_vectorp = (char **)proc_vector32; *vsizep = vsize; return (0); } proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK); for (i = 0; i < (int)vsize; i++) proc_vector[i] = PTRIN(proc_vector32[i]); *proc_vectorp = proc_vector; *vsizep = vsize; done: free(proc_vector32, M_TEMP); return (error); } #endif static int get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct ps_strings pss; Elf_Auxinfo aux; vm_offset_t vptr, ptr; char **proc_vector; size_t vsize, size; int i; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) return (get_proc_vector32(td, p, proc_vectorp, vsizep, type)); #endif if (proc_readmem(td, p, (vm_offset_t)p->p_sysent->sv_psstrings, &pss, sizeof(pss)) != sizeof(pss)) return (ENOMEM); switch (type) { case PROC_ARG: vptr = (vm_offset_t)pss.ps_argvstr; vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_ENV: vptr = (vm_offset_t)pss.ps_envstr; vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_AUX: /* * The aux array is just above env array on the stack. Check * that the address is naturally aligned. */ vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1) * sizeof(char *); #if __ELF_WORD_SIZE == 64 if (vptr % sizeof(uint64_t) != 0) #else if (vptr % sizeof(uint32_t) != 0) #endif return (ENOEXEC); /* * We count the array size reading the aux vectors from the * stack until AT_NULL vector is returned. So (to keep the code * simple) we read the process stack twice: the first time here * to find the size and the second time when copying the vectors * to the allocated proc_vector. */ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { if (proc_readmem(td, p, ptr, &aux, sizeof(aux)) != sizeof(aux)) return (ENOMEM); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } /* * If the PROC_AUXV_MAX entries are iterated over, and we have * not reached AT_NULL, it is most likely we are reading wrong * data: either the process doesn't have auxv array or data has * been modified. Return the error in this case. */ if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); /* In case we are built without INVARIANTS. */ } proc_vector = malloc(size, M_TEMP, M_WAITOK); if (proc_readmem(td, p, vptr, proc_vector, size) != size) { free(proc_vector, M_TEMP); return (ENOMEM); } *proc_vectorp = proc_vector; *vsizep = vsize; return (0); } #define GET_PS_STRINGS_CHUNK_SZ 256 /* Chunk size (bytes) for ps_strings operations. */ static int get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb, enum proc_vector_type type) { size_t done, len, nchr, vsize; int error, i; char **proc_vector, *sptr; char pss_string[GET_PS_STRINGS_CHUNK_SZ]; PROC_ASSERT_HELD(p); /* * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes. */ nchr = 2 * (PATH_MAX + ARG_MAX); error = get_proc_vector(td, p, &proc_vector, &vsize, type); if (error != 0) return (error); for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) { /* * The program may have scribbled into its argv array, e.g. to * remove some arguments. If that has happened, break out * before trying to read from NULL. */ if (proc_vector[i] == NULL) break; for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) { error = proc_read_string(td, p, sptr, pss_string, sizeof(pss_string)); if (error != 0) goto done; len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ); if (done + len >= nchr) len = nchr - done - 1; sbuf_bcat(sb, pss_string, len); if (len != GET_PS_STRINGS_CHUNK_SZ) break; done += GET_PS_STRINGS_CHUNK_SZ; } sbuf_bcat(sb, "", 1); done += len + 1; } done: free(proc_vector, M_TEMP); return (error); } int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ARG)); } int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ENV)); } int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb) { size_t vsize, size; char **auxv; int error; error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX); if (error == 0) { #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) size = vsize * sizeof(Elf32_Auxinfo); else #endif size = vsize * sizeof(Elf_Auxinfo); if (sbuf_bcat(sb, auxv, size) != 0) error = ENOMEM; free(auxv, M_TEMP); } return (error); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. */ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct pargs *newpa, *pa; struct proc *p; struct sbuf sb; int flags, error = 0, error2; if (namelen != 1) return (EINVAL); flags = PGET_CANSEE; if (req->newptr != NULL) flags |= PGET_ISCURRENT; error = pget((pid_t)name[0], flags, &p); if (error) return (error); pa = p->p_args; if (pa != NULL) { pargs_hold(pa); PROC_UNLOCK(p); error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); pargs_drop(pa); } else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) { _PHOLD(p); PROC_UNLOCK(p); sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getargv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); if (error == 0 && error2 != 0) error = error2; } else { PROC_UNLOCK(p); } if (error != 0 || req->newptr == NULL) return (error); if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (ENOMEM); newpa = pargs_alloc(req->newlen); error = SYSCTL_IN(req, newpa->ar_args, req->newlen); if (error != 0) { pargs_free(newpa); return (error); } PROC_LOCK(p); pa = p->p_args; p->p_args = newpa; PROC_UNLOCK(p); pargs_drop(pa); return (0); } /* * This sysctl allows a process to retrieve environment of another process. */ static int sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getenvv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * This sysctl allows a process to retrieve ELF auxiliary vector of * another process. */ static int sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = proc_getauxv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * This sysctl allows a process to retrieve the path of the executable for * itself or another process. */ static int sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct vnode *vp; char *retbuf, *freebuf; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; } else { error = pget(*pidp, PGET_CANSEE, &p); if (error != 0) return (error); } vp = p->p_textvp; if (vp == NULL) { if (*pidp != -1) PROC_UNLOCK(p); return (0); } vref(vp); if (*pidp != -1) PROC_UNLOCK(p); error = vn_fullpath(req->td, vp, &retbuf, &freebuf); vrele(vp); if (error) return (error); error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1); free(freebuf, M_TEMP); return (error); } static int sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS) { struct proc *p; char *sv_name; int *name; int namelen; int error; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error != 0) return (error); sv_name = p->p_sysent->sv_name; PROC_UNLOCK(p); return (sysctl_handle_string(oidp, sv_name, 0, req)); } #ifdef KINFO_OVMENTRY_SIZE CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE); #endif #ifdef COMPAT_FREEBSD7 static int sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS) { vm_map_entry_t entry, tmp_entry; unsigned int last_timestamp; char *fullpath, *freepath; struct kinfo_ovmentry *kve; struct vattr va; struct ucred *cred; int error, *name; struct vnode *vp; struct proc *p; vm_map_t map; struct vmspace *vm; name = (int *)arg1; error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK); map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { vm_object_t obj, tobj, lobj; vm_offset_t addr; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; bzero(kve, sizeof(*kve)); kve->kve_structsize = sizeof(*kve); kve->kve_private_resident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); if (obj->shadow_count == 1) kve->kve_private_resident = obj->resident_page_count; } kve->kve_resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) kve->kve_resident++; addr += PAGE_SIZE; } for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } kve->kve_start = (void*)entry->start; kve->kve_end = (void*)entry->end; kve->kve_offset = (off_t)entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; last_timestamp = map->timestamp; vm_map_unlock_read(map); kve->kve_fileid = 0; kve->kve_fsid = 0; freepath = NULL; fullpath = ""; if (lobj) { vp = NULL; switch (lobj->type) { case OBJT_DEFAULT: kve->kve_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kve->kve_type = KVME_TYPE_VNODE; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: if ((lobj->flags & OBJ_TMPFS_NODE) != 0) { kve->kve_type = KVME_TYPE_VNODE; if ((lobj->flags & OBJ_TMPFS) != 0) { vp = lobj->un_pager.swp.swp_tmpfs; vref(vp); } } else { kve->kve_type = KVME_TYPE_SWAP; } break; case OBJT_DEVICE: kve->kve_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kve->kve_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kve->kve_type = KVME_TYPE_DEAD; break; case OBJT_SG: kve->kve_type = KVME_TYPE_SG; break; default: kve->kve_type = KVME_TYPE_UNKNOWN; break; } if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_fileid = va.va_fileid; kve->kve_fsid = va.va_fsid; } vput(vp); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); error = SYSCTL_OUT(req, kve, sizeof(*kve)); vm_map_lock_read(map); if (error) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } #endif /* COMPAT_FREEBSD7 */ #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void kern_proc_vmmap_resident(vm_map_t map, vm_map_entry_t entry, struct kinfo_vmentry *kve) { vm_object_t obj, tobj; vm_page_t m, m_adv; vm_offset_t addr; vm_paddr_t locked_pa; vm_pindex_t pi, pi_adv, pindex; locked_pa = 0; obj = entry->object.vm_object; addr = entry->start; m_adv = NULL; pi = OFF_TO_IDX(entry->offset); for (; addr < entry->end; addr += IDX_TO_OFF(pi_adv), pi += pi_adv) { if (m_adv != NULL) { m = m_adv; } else { pi_adv = OFF_TO_IDX(entry->end - addr); pindex = pi; for (tobj = obj;; tobj = tobj->backing_object) { m = vm_page_find_least(tobj, pindex); if (m != NULL) { if (m->pindex == pindex) break; if (pi_adv > m->pindex - pindex) { pi_adv = m->pindex - pindex; m_adv = m; } } if (tobj->backing_object == NULL) goto next; pindex += OFF_TO_IDX(tobj-> backing_object_offset); } } m_adv = NULL; if (m->psind != 0 && addr + pagesizes[1] <= entry->end && (addr & (pagesizes[1] - 1)) == 0 && (pmap_mincore(map->pmap, addr, &locked_pa) & MINCORE_SUPER) != 0) { kve->kve_flags |= KVME_FLAG_SUPER; pi_adv = OFF_TO_IDX(pagesizes[1]); } else { /* * We do not test the found page on validity. * Either the page is busy and being paged in, * or it was invalidated. The first case * should be counted as resident, the second * is not so clear; we do account both. */ pi_adv = 1; } kve->kve_resident += pi_adv; next:; } PA_UNLOCK_COND(locked_pa); } /* * Must be called with the process locked and will return unlocked. */ int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { vm_map_entry_t entry, tmp_entry; struct vattr va; vm_map_t map; vm_object_t obj, tobj, lobj; char *fullpath, *freepath; struct kinfo_vmentry *kve; struct ucred *cred; struct vnode *vp; struct vmspace *vm; vm_offset_t addr; unsigned int last_timestamp; int error; PROC_LOCK_ASSERT(p, MA_OWNED); _PHOLD(p); PROC_UNLOCK(p); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK | M_ZERO); error = 0; map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; addr = entry->end; bzero(kve, sizeof(*kve)); obj = entry->object.vm_object; if (obj != NULL) { for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { VM_OBJECT_RLOCK(tobj); lobj = tobj; } if (obj->backing_object == NULL) kve->kve_private_resident = obj->resident_page_count; if (!vmmap_skip_res_cnt) kern_proc_vmmap_resident(map, entry, kve); for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj && tobj != lobj) VM_OBJECT_RUNLOCK(tobj); } } else { lobj = NULL; } kve->kve_start = entry->start; kve->kve_end = entry->end; kve->kve_offset = entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; if (entry->eflags & MAP_ENTRY_GROWS_UP) kve->kve_flags |= KVME_FLAG_GROWS_UP; if (entry->eflags & MAP_ENTRY_GROWS_DOWN) kve->kve_flags |= KVME_FLAG_GROWS_DOWN; last_timestamp = map->timestamp; vm_map_unlock_read(map); freepath = NULL; fullpath = ""; if (lobj != NULL) { vp = NULL; switch (lobj->type) { case OBJT_DEFAULT: kve->kve_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kve->kve_type = KVME_TYPE_VNODE; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: if ((lobj->flags & OBJ_TMPFS_NODE) != 0) { kve->kve_type = KVME_TYPE_VNODE; if ((lobj->flags & OBJ_TMPFS) != 0) { vp = lobj->un_pager.swp.swp_tmpfs; vref(vp); } } else { kve->kve_type = KVME_TYPE_SWAP; } break; case OBJT_DEVICE: kve->kve_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kve->kve_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kve->kve_type = KVME_TYPE_DEAD; break; case OBJT_SG: kve->kve_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kve->kve_type = KVME_TYPE_MGTDEVICE; break; default: kve->kve_type = KVME_TYPE_UNKNOWN; break; } if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); kve->kve_vn_type = vntype_to_kinfo(vp->v_type); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_vn_fileid = va.va_fileid; kve->kve_vn_fsid = va.va_fsid; kve->kve_vn_mode = MAKEIMODE(va.va_type, va.va_mode); kve->kve_vn_size = va.va_size; kve->kve_vn_rdev = va.va_rdev; kve->kve_status = KF_ATTR_VALID; } vput(vp); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ if ((flags & KERN_VMMAP_PACK_KINFO) != 0) kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) + strlen(kve->kve_path) + 1; else kve->kve_structsize = sizeof(*kve); kve->kve_structsize = roundup(kve->kve_structsize, sizeof(uint64_t)); /* Halt filling and truncate rather than exceeding maxlen */ if (maxlen != -1 && maxlen < kve->kve_structsize) { error = 0; vm_map_lock_read(map); break; } else if (maxlen != -1) maxlen -= kve->kve_structsize; if (sbuf_bcat(sb, kve, kve->kve_structsize) != 0) error = ENOMEM; vm_map_lock_read(map); if (error != 0) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } static int sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS) { struct proc *p; struct sbuf sb; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } error = kern_proc_vmmap_out(p, &sb, -1, KERN_VMMAP_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #if defined(STACK) || defined(DDB) static int sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS) { struct kinfo_kstack *kkstp; int error, i, *name, numthreads; lwpid_t *lwpidarray; struct thread *td; struct stack *st; struct sbuf sb; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p); if (error != 0) return (error); kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK); st = stack_create(); lwpidarray = NULL; numthreads = 0; PROC_LOCK(p); repeat: if (numthreads < p->p_numthreads) { if (lwpidarray != NULL) { free(lwpidarray, M_TEMP); lwpidarray = NULL; } numthreads = p->p_numthreads; PROC_UNLOCK(p); lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP, M_WAITOK | M_ZERO); PROC_LOCK(p); goto repeat; } i = 0; /* * XXXRW: During the below loop, execve(2) and countless other sorts * of changes could have taken place. Should we check to see if the * vmspace has been replaced, or the like, in order to prevent * giving a snapshot that spans, say, execve(2), with some threads * before and some after? Among other things, the credentials could * have changed, in which case the right to extract debug info might * no longer be assured. */ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(i < numthreads, ("sysctl_kern_proc_kstack: numthreads")); lwpidarray[i] = td->td_tid; i++; } numthreads = i; for (i = 0; i < numthreads; i++) { td = thread_find(p, lwpidarray[i]); if (td == NULL) { continue; } bzero(kkstp, sizeof(*kkstp)); (void)sbuf_new(&sb, kkstp->kkst_trace, sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN); thread_lock(td); kkstp->kkst_tid = td->td_tid; if (TD_IS_SWAPPED(td)) { kkstp->kkst_state = KKST_STATE_SWAPPED; } else if (TD_IS_RUNNING(td)) { if (stack_save_td_running(st, td) == 0) kkstp->kkst_state = KKST_STATE_STACKOK; else kkstp->kkst_state = KKST_STATE_RUNNING; } else { kkstp->kkst_state = KKST_STATE_STACKOK; stack_save_td(st, td); } thread_unlock(td); PROC_UNLOCK(p); stack_sbuf_print(&sb, st); sbuf_finish(&sb); sbuf_delete(&sb); error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp)); PROC_LOCK(p); if (error) break; } _PRELE(p); PROC_UNLOCK(p); if (lwpidarray != NULL) free(lwpidarray, M_TEMP); stack_destroy(st); free(kkstp, M_TEMP); return (error); } #endif /* * This sysctl allows a process to retrieve the full list of groups from * itself or another process. */ static int sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct ucred *cred; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; PROC_LOCK(p); } else { error = pget(*pidp, PGET_CANSEE, &p); if (error != 0) return (error); } cred = crhold(p->p_ucred); PROC_UNLOCK(p); error = SYSCTL_OUT(req, cred->cr_groups, cred->cr_ngroups * sizeof(gid_t)); crfree(cred); return (error); } /* * This sysctl allows a process to retrieve or/and set the resource limit for * another process. */ static int sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rlimit rlim; struct proc *p; u_int which; int flags, error; if (namelen != 2) return (EINVAL); which = (u_int)name[1]; if (which >= RLIM_NLIMITS) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(rlim)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); /* * Retrieve limit. */ if (req->oldptr != NULL) { PROC_LOCK(p); lim_rlimit_proc(p, which, &rlim); PROC_UNLOCK(p); } error = SYSCTL_OUT(req, &rlim, sizeof(rlim)); if (error != 0) goto errout; /* * Set limit. */ if (req->newptr != NULL) { error = SYSCTL_IN(req, &rlim, sizeof(rlim)); if (error == 0) error = kern_proc_setrlimit(curthread, p, which, &rlim); } errout: PRELE(p); return (error); } /* * This sysctl allows a process to retrieve ps_strings structure location of * another process. */ static int sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; vm_offset_t ps_strings; int error; #ifdef COMPAT_FREEBSD32 uint32_t ps_strings32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { /* * We return 0 if the 32 bit emulation request is for a 64 bit * process. */ ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ? PTROUT(p->p_sysent->sv_psstrings) : 0; PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32)); return (error); } #endif ps_strings = p->p_sysent->sv_psstrings; PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings)); return (error); } /* * This sysctl allows a process to retrieve umask of another process. */ static int sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int error; u_short fd_cmask; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); FILEDESC_SLOCK(p->p_fd); fd_cmask = p->p_fd->fd_cmask; FILEDESC_SUNLOCK(p->p_fd); PRELE(p); error = SYSCTL_OUT(req, &fd_cmask, sizeof(fd_cmask)); return (error); } /* * This sysctl allows a process to set and retrieve binary osreldate of * another process. */ static int sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, error, osrel; if (namelen != 1) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(osrel)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel)); if (error != 0) goto errout; if (req->newptr != NULL) { error = SYSCTL_IN(req, &osrel, sizeof(osrel)); if (error != 0) goto errout; if (osrel < 0) { error = EINVAL; goto errout; } p->p_osrel = osrel; } errout: PRELE(p); return (error); } static int sysctl_kern_proc_sigtramp(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct kinfo_sigtramp kst; const struct sysentvec *sv; int error; #ifdef COMPAT_FREEBSD32 struct kinfo_sigtramp32 kst32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); sv = p->p_sysent; #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { bzero(&kst32, sizeof(kst32)); if (SV_PROC_FLAG(p, SV_ILP32)) { if (sv->sv_sigcode_base != 0) { kst32.ksigtramp_start = sv->sv_sigcode_base; kst32.ksigtramp_end = sv->sv_sigcode_base + *sv->sv_szsigcode; } else { kst32.ksigtramp_start = sv->sv_psstrings - *sv->sv_szsigcode; kst32.ksigtramp_end = sv->sv_psstrings; } } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst32, sizeof(kst32)); return (error); } #endif bzero(&kst, sizeof(kst)); if (sv->sv_sigcode_base != 0) { kst.ksigtramp_start = (char *)sv->sv_sigcode_base; kst.ksigtramp_end = (char *)sv->sv_sigcode_base + *sv->sv_szsigcode; } else { kst.ksigtramp_start = (char *)sv->sv_psstrings - *sv->sv_szsigcode; kst.ksigtramp_end = (char *)sv->sv_psstrings; } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst, sizeof(kst)); return (error); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT| CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_CAPWR | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_args, "Process argument list"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_env, "Process environment"); static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, no threads"); #ifdef COMPAT_FREEBSD7 static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries"); #if defined(STACK) || defined(DDB) static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit, "Process resource limits"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings, "Process ps_strings location"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask"); static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel, "Process binary osreldate"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SIGTRAMP, sigtramp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sigtramp, "Process signal trampoline location"); int allproc_gen; /* * stop_all_proc() purpose is to stop all process which have usermode, * except current process for obvious reasons. This makes it somewhat * unreliable when invoked from multithreaded process. The service * must not be user-callable anyway. */ void stop_all_proc(void) { struct proc *cp, *p; int r, gen; bool restart, seen_stopped, seen_exiting, stopped_some; cp = curproc; allproc_loop: sx_xlock(&allproc_lock); gen = allproc_gen; seen_exiting = seen_stopped = stopped_some = restart = false; LIST_REMOVE(cp, p_list); LIST_INSERT_HEAD(&allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) break; LIST_REMOVE(cp, p_list); LIST_INSERT_AFTER(p, cp, p_list); PROC_LOCK(p); if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) { PROC_UNLOCK(p); continue; } if ((p->p_flag & P_WEXIT) != 0) { seen_exiting = true; PROC_UNLOCK(p); continue; } if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { /* * Stopped processes are tolerated when there * are no other processes which might continue * them. P_STOPPED_SINGLE but not * P_TOTAL_STOP process still has at least one * thread running. */ seen_stopped = true; PROC_UNLOCK(p); continue; } _PHOLD(p); sx_xunlock(&allproc_lock); r = thread_single(p, SINGLE_ALLPROC); if (r != 0) restart = true; else stopped_some = true; _PRELE(p); PROC_UNLOCK(p); sx_xlock(&allproc_lock); } /* Catch forked children we did not see in iteration. */ if (gen != allproc_gen) restart = true; sx_xunlock(&allproc_lock); if (restart || stopped_some || seen_exiting || seen_stopped) { kern_yield(PRI_USER); goto allproc_loop; } } void resume_all_proc(void) { struct proc *cp, *p; cp = curproc; sx_xlock(&allproc_lock); LIST_REMOVE(cp, p_list); LIST_INSERT_HEAD(&allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) break; LIST_REMOVE(cp, p_list); LIST_INSERT_AFTER(p, cp, p_list); PROC_LOCK(p); if ((p->p_flag & P_TOTAL_STOP) != 0) { sx_xunlock(&allproc_lock); _PHOLD(p); thread_single_end(p, SINGLE_ALLPROC); _PRELE(p); PROC_UNLOCK(p); sx_xlock(&allproc_lock); } else { PROC_UNLOCK(p); } } sx_xunlock(&allproc_lock); } /* #define TOTAL_STOP_DEBUG 1 */ #ifdef TOTAL_STOP_DEBUG volatile static int ap_resume; #include static int sysctl_debug_stop_all_proc(SYSCTL_HANDLER_ARGS) { int error, val; val = 0; ap_resume = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0) { stop_all_proc(); syncer_suspend(); while (ap_resume == 0) ; syncer_resume(); resume_all_proc(); } return (0); } SYSCTL_PROC(_debug, OID_AUTO, stop_all_proc, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, __DEVOLATILE(int *, &ap_resume), 0, sysctl_debug_stop_all_proc, "I", ""); #endif Index: head/sys/kern/kern_sysctl.c =================================================================== --- head/sys/kern/kern_sysctl.c (revision 298068) +++ head/sys/kern/kern_sysctl.c (revision 298069) @@ -1,1995 +1,1995 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); /* * The sysctllock protects the MIB tree. It also protects sysctl * contexts used with dynamic sysctls. The sysctl_register_oid() and * sysctl_unregister_oid() routines require the sysctllock to already * be held, so the sysctl_wlock() and sysctl_wunlock() routines are * provided for the few places in the kernel which need to use that * API rather than using the dynamic API. Use of the dynamic API is * strongly encouraged for most code. * * The sysctlmemlock is used to limit the amount of user memory wired for * sysctl requests. This is implemented by serializing any userland * sysctl requests larger than a single page via an exclusive lock. */ static struct rmlock sysctllock; static struct sx sysctlmemlock; #define SYSCTL_WLOCK() rm_wlock(&sysctllock) #define SYSCTL_WUNLOCK() rm_wunlock(&sysctllock) #define SYSCTL_RLOCK(tracker) rm_rlock(&sysctllock, (tracker)) #define SYSCTL_RUNLOCK(tracker) rm_runlock(&sysctllock, (tracker)) #define SYSCTL_WLOCKED() rm_wowned(&sysctllock) #define SYSCTL_ASSERT_LOCKED() rm_assert(&sysctllock, RA_LOCKED) #define SYSCTL_ASSERT_WLOCKED() rm_assert(&sysctllock, RA_WLOCKED) #define SYSCTL_ASSERT_RLOCKED() rm_assert(&sysctllock, RA_RLOCKED) #define SYSCTL_INIT() rm_init_flags(&sysctllock, "sysctl lock", \ RM_SLEEPABLE) #define SYSCTL_SLEEP(ch, wmesg, timo) \ rm_sleep(ch, &sysctllock, 0, wmesg, timo) static int sysctl_root(SYSCTL_HANDLER_ARGS); /* Root list */ struct sysctl_oid_list sysctl__children = SLIST_HEAD_INITIALIZER(&sysctl__children); static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse); static int sysctl_old_kernel(struct sysctl_req *, const void *, size_t); static int sysctl_new_kernel(struct sysctl_req *, void *, size_t); static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SLIST_FOREACH(oidp, list, oid_link) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. */ void sysctl_wlock(void) { SYSCTL_WLOCK(); } void sysctl_wunlock(void) { SYSCTL_WUNLOCK(); } static int sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2, struct sysctl_req *req, struct rm_priotracker *tracker) { int error; if (oid->oid_kind & CTLFLAG_DYN) atomic_add_int(&oid->oid_running, 1); if (tracker != NULL) SYSCTL_RUNLOCK(tracker); else SYSCTL_WUNLOCK(); if (!(oid->oid_kind & CTLFLAG_MPSAFE)) mtx_lock(&Giant); error = oid->oid_handler(oid, arg1, arg2, req); if (!(oid->oid_kind & CTLFLAG_MPSAFE)) mtx_unlock(&Giant); KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error); if (tracker != NULL) SYSCTL_RLOCK(tracker); else SYSCTL_WLOCK(); if (oid->oid_kind & CTLFLAG_DYN) { if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 && (oid->oid_kind & CTLFLAG_DYING) != 0) wakeup(&oid->oid_running); } return (error); } static void sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) { struct sysctl_req req; struct sysctl_oid *curr; char *penv = NULL; char path[64]; ssize_t rem = sizeof(path); ssize_t len; uint8_t val_8; uint16_t val_16; uint32_t val_32; int val_int; long val_long; int64_t val_64; quad_t val_quad; int error; path[--rem] = 0; for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) { len = strlen(curr->oid_name); rem -= len; if (curr != oidp) rem -= 1; if (rem < 0) { printf("OID path exceeds %d bytes\n", (int)sizeof(path)); return; } memcpy(path + rem, curr->oid_name, len); if (curr != oidp) path[rem + len] = '.'; } memset(&req, 0, sizeof(req)); req.td = curthread; req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: if (getenv_int(path + rem, &val_int) == 0) return; req.newlen = sizeof(val_int); req.newptr = &val_int; break; case CTLTYPE_UINT: if (getenv_uint(path + rem, (unsigned int *)&val_int) == 0) return; req.newlen = sizeof(val_int); req.newptr = &val_int; break; case CTLTYPE_LONG: if (getenv_long(path + rem, &val_long) == 0) return; req.newlen = sizeof(val_long); req.newptr = &val_long; break; case CTLTYPE_ULONG: if (getenv_ulong(path + rem, (unsigned long *)&val_long) == 0) return; req.newlen = sizeof(val_long); req.newptr = &val_long; break; case CTLTYPE_S8: if (getenv_int(path + rem, &val_int) == 0) return; val_8 = val_int; req.newlen = sizeof(val_8); req.newptr = &val_8; break; case CTLTYPE_S16: if (getenv_int(path + rem, &val_int) == 0) return; val_16 = val_int; req.newlen = sizeof(val_16); req.newptr = &val_16; break; case CTLTYPE_S32: if (getenv_long(path + rem, &val_long) == 0) return; val_32 = val_long; req.newlen = sizeof(val_32); req.newptr = &val_32; break; case CTLTYPE_S64: if (getenv_quad(path + rem, &val_quad) == 0) return; val_64 = val_quad; req.newlen = sizeof(val_64); req.newptr = &val_64; break; case CTLTYPE_U8: if (getenv_uint(path + rem, (unsigned int *)&val_int) == 0) return; val_8 = val_int; req.newlen = sizeof(val_8); req.newptr = &val_8; break; case CTLTYPE_U16: if (getenv_uint(path + rem, (unsigned int *)&val_int) == 0) return; val_16 = val_int; req.newlen = sizeof(val_16); req.newptr = &val_16; break; case CTLTYPE_U32: if (getenv_ulong(path + rem, (unsigned long *)&val_long) == 0) return; val_32 = val_long; req.newlen = sizeof(val_32); req.newptr = &val_32; break; case CTLTYPE_U64: /* XXX there is no getenv_uquad() */ if (getenv_quad(path + rem, &val_quad) == 0) return; val_64 = val_quad; req.newlen = sizeof(val_64); req.newptr = &val_64; break; case CTLTYPE_STRING: penv = kern_getenv(path + rem); if (penv == NULL) return; req.newlen = strlen(penv); req.newptr = penv; break; default: return; } error = sysctl_root_handler_locked(oidp, oidp->oid_arg1, oidp->oid_arg2, &req, NULL); if (error != 0) printf("Setting sysctl %s failed: %d\n", path + rem, error); if (penv != NULL) freeenv(penv); } void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; int oid_number; int timeout = 2; /* * First check if another oid with the same name already * exists in the parent's list. */ SYSCTL_ASSERT_WLOCKED(); p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { printf("can't re-use a leaf (%s)!\n", p->oid_name); return; } } /* get current OID number */ oid_number = oidp->oid_number; #if (OID_AUTO >= 0) #error "OID_AUTO is expected to be a negative value" #endif /* * Any negative OID number qualifies as OID_AUTO. Valid OID * numbers should always be positive. * * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accomodate e.g. net.inet.raw as a static sysctl node. */ if (oid_number < 0) { static int newoid; /* * By decrementing the next OID number we spend less * time inserting the OIDs into a sorted list. */ if (--newoid < CTL_AUTO_START) newoid = 0x7fffffff; oid_number = newoid; } /* * Insert the OID into the parent's list sorted by OID number. */ retry: q = NULL; SLIST_FOREACH(p, parent, oid_link) { /* check if the current OID number is in use */ if (oid_number == p->oid_number) { /* get the next valid OID number */ if (oid_number < CTL_AUTO_START || oid_number == 0x7fffffff) { /* wraparound - restart */ oid_number = CTL_AUTO_START; /* don't loop forever */ if (!timeout--) panic("sysctl: Out of OID numbers\n"); goto retry; } else { oid_number++; } } else if (oid_number < p->oid_number) break; q = p; } /* check for non-auto OID number collision */ if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && oid_number >= CTL_AUTO_START) { printf("sysctl: OID number(%d) is already in use for '%s'\n", oidp->oid_number, oidp->oid_name); } /* update the OID number, if any */ oidp->oid_number = oid_number; if (q != NULL) SLIST_INSERT_AFTER(q, oidp, oid_link); else SLIST_INSERT_HEAD(parent, oidp, oid_link); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && #ifdef VIMAGE (oidp->oid_kind & CTLFLAG_VNET) == 0 && #endif (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { /* only fetch value once */ oidp->oid_kind |= CTLFLAG_NOFETCH; /* try to fetch value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } } void sysctl_unregister_oid(struct sysctl_oid *oidp) { struct sysctl_oid *p; int error; SYSCTL_ASSERT_WLOCKED(); error = ENOENT; if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp) { SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); error = 0; break; } } } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) printf("%s: failed to unregister sysctl\n", __func__); } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } /* * No locking here, the caller is responsible for not adding * new nodes to a context until after this function has * returned. */ TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ SYSCTL_WLOCK(); TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid_locked(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occured. * e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) { SYSCTL_WUNLOCK(); return(EBUSY); } /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid_locked(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } SYSCTL_WUNLOCK(); return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if(e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); SYSCTL_WLOCK(); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); SYSCTL_WUNLOCK(); free(e, M_SYSCTLOID); return (0); } else { SYSCTL_WUNLOCK(); return (ENOENT); } } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { int error; SYSCTL_WLOCK(); error = sysctl_remove_oid_locked(oidp, del, recurse); SYSCTL_WUNLOCK(); return (error); } int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; error = ENOENT; SYSCTL_WLOCK(); SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) { if (strcmp(p->oid_name, name) == 0) { error = sysctl_remove_oid_locked(p, del, recurse); break; } } SYSCTL_WUNLOCK(); return (error); } static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("can't remove non-dynamic nodes!\n"); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(oidp), oid_link, tmp) { if (!recurse) { printf("Warning: failed attempt to " "remove oid %s with child %s\n", oidp->oid_name, p->oid_name); return (ENOTEMPTY); } error = sysctl_remove_oid_locked(p, del, recurse); if (error) return (error); } } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { /* * Wait for all threads running the handler to drain. * This preserves the previous behavior when the * sysctl lock was held across a handler invocation, * and is necessary for module unload correctness. */ while (oidp->oid_running > 0) { oidp->oid_kind |= CTLFLAG_DYING; SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0); } if (oidp->oid_descr) free(__DECONST(char *, oidp->oid_descr), M_SYSCTLOID); free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) { struct sysctl_oid *oidp; /* You have to hook up somewhere.. */ if (parent == NULL) return(NULL); /* Check if the node already exists, otherwise create it */ SYSCTL_WLOCK(); oidp = sysctl_find_oidname(name, parent); if (oidp != NULL) { if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); SYSCTL_WUNLOCK(); return (oidp); } else { SYSCTL_WUNLOCK(); printf("can't re-use a leaf (%s)!\n", name); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; SLIST_INIT(&oidp->oid_children); oidp->oid_number = number; oidp->oid_refcnt = 1; oidp->oid_name = strdup(name, M_SYSCTLOID); oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; oidp->oid_fmt = fmt; if (descr != NULL) oidp->oid_descr = strdup(descr, M_SYSCTLOID); /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); SYSCTL_WUNLOCK(); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { char *newname; char *oldname; newname = strdup(name, M_SYSCTLOID); SYSCTL_WLOCK(); oldname = __DECONST(char *, oidp->oid_name); oidp->oid_name = newname; SYSCTL_WUNLOCK(); free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; SYSCTL_WLOCK(); if (oid->oid_parent == parent) { SYSCTL_WUNLOCK(); return (0); } oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) { SYSCTL_WUNLOCK(); return (EEXIST); } sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); SYSCTL_WUNLOCK(); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; sx_init(&sysctlmemlock, "sysctl mem"); SYSCTL_INIT(); SYSCTL_WLOCK(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); SYSCTL_WUNLOCK(); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, 0); /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * * {0,0} printf the entire MIB-tree. * {0,1,...} return the name of the "..." OID. * {0,2,...} return the next OID. * {0,3} return the OID of the name in "new" * {0,4,...} return the kind & format info for the "..." OID. * {0,5,...} return the description the "..." OID. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SLIST_FOREACH(oidp, l, oid_link) { for (k=0; koid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ? 'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( SYSCTL_CHILDREN(oidp), i + 2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_UINT: printf(" u_int\n"); break; case CTLTYPE_LONG: printf(" Long\n"); break; case CTLTYPE_ULONG: printf(" u_long\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_S8: printf(" int8_t\n"); break; case CTLTYPE_S16: printf(" int16_t\n"); break; case CTLTYPE_S32: printf(" int32_t\n"); break; case CTLTYPE_S64: printf(" int64_t\n"); break; case CTLTYPE_U8: printf(" uint8_t\n"); break; case CTLTYPE_U16: printf(" uint16_t\n"); break; case CTLTYPE_U32: printf(" uint32_t\n"); break; case CTLTYPE_U64: printf(" uint64_t\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); SYSCTL_RLOCK(&tracker); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); SYSCTL_RUNLOCK(&tracker); return (ENOENT); } SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; struct rm_priotracker tracker; char buf[10]; SYSCTL_RLOCK(&tracker); while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) goto out; namelen--; name++; continue; } - lsp2 = 0; + lsp2 = NULL; SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number != *name) continue; if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) goto out; namelen--; name++; if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oid->oid_handler) break; lsp2 = SYSCTL_CHILDREN(oid); break; } lsp = lsp2; } error = SYSCTL_OUT(req, "", 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } /* * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); *len = level; SLIST_FOREACH(oidp, lsp, oid_link) { *next = oidp->oid_number; *oidpp = oidp; if (oidp->oid_kind & CTLFLAG_SKIP) continue; if (!namelen) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) /* We really should call the handler here...*/ return (0); lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, len, level+1, oidpp)) return (0); goto emptynode; } if (oidp->oid_number < *name) continue; if (oidp->oid_number > *name) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) return (0); lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); goto next; } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) continue; if (oidp->oid_handler) continue; lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); next: namelen = 1; emptynode: *len = level; } return (1); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int i, j, error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; struct rm_priotracker tracker; int newoid[CTL_MAXNAME]; SYSCTL_RLOCK(&tracker); i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); SYSCTL_RUNLOCK(&tracker); if (i) return (ENOENT); error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } /* * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; char *p; SYSCTL_ASSERT_LOCKED(); for (*len = 0; *len < CTL_MAXNAME;) { p = strsep(&name, "."); oidp = SLIST_FIRST(lsp); for (;; oidp = SLIST_NEXT(oidp, oid_link)) { if (oidp == NULL) return (ENOENT); if (strcmp(p, oidp->oid_name) == 0) break; } *oid++ = oidp->oid_number; (*len)++; if (name == NULL || *name == '\0') { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = SYSCTL_CHILDREN(oidp); } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len = 0; - struct sysctl_oid *op = 0; + struct sysctl_oid *op = NULL; struct rm_priotracker tracker; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; SYSCTL_RLOCK(&tracker); error = name2oid(p, oid, &len, &op); SYSCTL_RUNLOCK(&tracker); free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } /* * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. */ SYSCTL_PROC(_sysctl, 3, name2oid, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_fmt == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) goto out; error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_descr == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); /* * Default "handler" functions. */ /* * Handle an int8_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_8(SYSCTL_HANDLER_ARGS) { int8_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int8_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int16_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_16(SYSCTL_HANDLER_ARGS) { int16_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int16_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int32_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_32(SYSCTL_HANDLER_ARGS) { int32_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int32_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on on sysctl_handle_int() convert milliseconds into ticks. * Note: this is used by TCP. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmplong = *(long *)arg1; else tmplong = arg2; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; #ifdef SCTL_MASK32 else if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } #endif else error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_64(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(uint64_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { size_t outlen; int error = 0, ro_string = 0; /* * A zero-length buffer indicates a fixed size read-only * string: */ if (arg2 == 0) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } if (req->oldptr != NULL) { char *tmparg; if (ro_string) { tmparg = arg1; } else { /* try to make a coherent snapshot of the string */ tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); memcpy(tmparg, arg1, arg2); } outlen = strnlen(tmparg, arg2 - 1) + 1; error = SYSCTL_OUT(req, tmparg, outlen); if (!ro_string) free(tmparg, M_SYSCTLTMP); } else { outlen = strnlen((char *)arg1, arg2 - 1) + 1; error = SYSCTL_OUT(req, NULL, outlen); } if (error || !req->newptr) return (error); if ((req->newlen - req->newidx) >= arg2) { error = EINVAL; } else { arg2 = (req->newlen - req->newidx); error = SYSCTL_IN(req, arg1, arg2); ((char *)arg1)[arg2] = '\0'; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Transfer functions to/from kernel space. * XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = 0; /* sysctl internal magic */ oid[1] = 3; /* name2oid */ oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { size_t i, len, origidx; int error; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; if (req->lock == REQ_WIRED) { error = copyout_nofault(p, (char *)req->oldptr + origidx, i); } else error = copyout(p, (char *)req->oldptr + origidx, i); if (error != 0) return (error); } if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t wiredlen; wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen; ret = 0; if (req->lock != REQ_WIRED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid_list *lsp; struct sysctl_oid *oid; int indx; SYSCTL_ASSERT_LOCKED(); lsp = &sysctl__children; indx = 0; while (indx < CTL_MAXNAME) { SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number == name[indx]) break; } if (oid == NULL) return (ENOENT); indx++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } lsp = SYSCTL_CHILDREN(oid); } else if (indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } else { return (ENOTDIR); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error, indx, lvl; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) goto out; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) { error = EISDIR; goto out; } } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) { error = EPERM; goto out; } KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); #ifdef CAPABILITY_MODE /* * If the process is in capability mode, then don't permit reading or * writing unless specifically granted for the node. */ if (IN_CAPABILITY_MODE(req->td)) { if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) || (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) { error = EPERM; goto out; } } #endif /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) goto out; } /* Is this sysctl writable by only privileged users? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { int priv; if (oid->oid_kind & CTLFLAG_PRISON) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; error = priv_check(req->td, priv); if (error) goto out; } if (!oid->oid_handler) { error = EINVAL; goto out; } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) goto out; #endif #ifdef VIMAGE if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); out: SYSCTL_RUNLOCK(&tracker); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctl(struct thread *td, struct sysctl_args *uap) { int error, i, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) return (i); } return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. */ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval, int flags) { int error = 0, memlocked; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; if (old) { if (!useracc(old, req.oldlen, VM_PROT_WRITE)) return (EFAULT); req.oldptr= old; } if (new != NULL) { if (!useracc(new, newlen, VM_PROT_READ)) return (EFAULT); req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_UNWIRED; #ifdef KTRACE if (KTRPOINT(curthread, KTR_SYSCTL)) ktrsysctl(name, namelen); #endif if (req.oldptr && req.oldlen > PAGE_SIZE) { memlocked = 1; sx_xlock(&sysctlmemlock); } else memlocked = 0; CURVNET_SET(TD_TO_VNET(td)); for (;;) { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); if (error != EAGAIN) break; kern_yield(PRI_USER); } CURVNET_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (memlocked) sx_xunlock(&sysctlmemlock); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Drain into a sysctl struct. The user buffer should be wired if a page * fault would cause issue. */ static int sbuf_sysctl_drain(void *arg, const char *data, int len) { struct sysctl_req *req = arg; int error; error = SYSCTL_OUT(req, data, len); KASSERT(error >= 0, ("Got unexpected negative value %d", error)); return (error == 0 ? len : -error); } struct sbuf * sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, struct sysctl_req *req) { /* Supply a default buffer size if none given. */ if (buf == NULL && length == 0) length = 64; s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } Index: head/sys/kern/link_elf.c =================================================================== --- head/sys/kern/link_elf.c (revision 298068) +++ head/sys/kern/link_elf.c (revision 298069) @@ -1,1658 +1,1658 @@ /*- * Copyright (c) 1998-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_gdb.h" #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SPARSE_MAPPING #include #include #include #endif #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" #define MAXSEGS 4 typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; /* Was file pre-loaded */ caddr_t address; /* Relocation address */ #ifdef SPARSE_MAPPING vm_object_t object; /* VM object to hold file pages */ #endif Elf_Dyn *dynamic; /* Symbol table etc. */ Elf_Hashelt nbuckets; /* DT_HASH info */ Elf_Hashelt nchains; const Elf_Hashelt *buckets; const Elf_Hashelt *chains; caddr_t hash; caddr_t strtab; /* DT_STRTAB */ int strsz; /* DT_STRSZ */ const Elf_Sym *symtab; /* DT_SYMTAB */ Elf_Addr *got; /* DT_PLTGOT */ const Elf_Rel *pltrel; /* DT_JMPREL */ int pltrelsize; /* DT_PLTRELSZ */ const Elf_Rela *pltrela; /* DT_JMPREL */ int pltrelasize; /* DT_PLTRELSZ */ const Elf_Rel *rel; /* DT_REL */ int relsize; /* DT_RELSZ */ const Elf_Rela *rela; /* DT_RELA */ int relasize; /* DT_RELASZ */ caddr_t modptr; const Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t symbase; /* malloc'ed symbold base */ caddr_t strbase; /* malloc'ed string base */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */ Elf_Addr pcpu_base; /* Relocated pcpu set address. */ #ifdef VIMAGE Elf_Addr vnet_start; /* Pre-relocation vnet set start. */ Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */ Elf_Addr vnet_base; /* Relocated vnet set address. */ #endif #ifdef GDB struct link_map gdb; /* hooks for gdb */ #endif } *elf_file_t; struct elf_set { Elf_Addr es_start; Elf_Addr es_stop; Elf_Addr es_base; TAILQ_ENTRY(elf_set) es_link; }; TAILQ_HEAD(elf_set_head, elf_set); #include static int link_elf_link_common_finish(linker_file_t); static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t, c_linker_sym_t *, long *); static void link_elf_unload_file(linker_file_t); static void link_elf_unload_preload(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static void link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32", #else "elf64", #endif link_elf_methods, sizeof(struct elf_file) }; static int parse_dynamic(elf_file_t); static int relocate_file(elf_file_t); static int link_elf_preload_parse_symbols(elf_file_t); static struct elf_set_head set_pcpu_list; #ifdef VIMAGE static struct elf_set_head set_vnet_list; #endif static void elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base) { struct elf_set *set, *iter; set = malloc(sizeof(*set), M_LINKER, M_WAITOK); set->es_start = start; set->es_stop = stop; set->es_base = base; TAILQ_FOREACH(iter, list, es_link) { KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) || (set->es_start > iter->es_start && set->es_stop > iter->es_stop), ("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx", (uintmax_t)set->es_start, (uintmax_t)set->es_stop, (uintmax_t)iter->es_start, (uintmax_t)iter->es_stop)); if (iter->es_start > set->es_start) { TAILQ_INSERT_BEFORE(iter, set, es_link); break; } } if (iter == NULL) TAILQ_INSERT_TAIL(list, set, es_link); } static int elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (addr < set->es_start) return (0); if (addr < set->es_stop) { *start = set->es_start; *base = set->es_base; return (1); } } return (0); } static void elf_set_delete(struct elf_set_head *list, Elf_Addr start) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (start < set->es_start) break; if (start == set->es_start) { TAILQ_REMOVE(list, set, es_link); free(set, M_LINKER); return; } } KASSERT(0, ("deleting unknown linker set (start = 0x%jx)", (uintmax_t)start)); } #ifdef GDB static void r_debug_state(struct r_debug *, struct link_map *); /* * A list of loaded modules for GDB to use for loading symbols. */ struct r_debug r_debug; #define GDB_STATE(s) do { \ r_debug.r_state = s; r_debug_state(NULL, NULL); \ } while (0) /* * Function for the debugger to set a breakpoint on to gain control. */ static void r_debug_state(struct r_debug *dummy_one __unused, struct link_map *dummy_two __unused) { } static void link_elf_add_gdb(struct link_map *l) { struct link_map *prev; l->l_next = NULL; if (r_debug.r_map == NULL) { /* Add first. */ l->l_prev = NULL; r_debug.r_map = l; } else { /* Append to list. */ for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) ; l->l_prev = prev; prev->l_next = l; } } static void link_elf_delete_gdb(struct link_map *l) { if (l->l_prev == NULL) { /* Remove first. */ if ((r_debug.r_map = l->l_next) != NULL) l->l_next->l_prev = NULL; } else { /* Remove any but first. */ if ((l->l_prev->l_next = l->l_next) != NULL) l->l_next->l_prev = l->l_prev; } } #endif /* GDB */ /* * The kernel symbol table starts here. */ extern struct _dynamic _DYNAMIC; static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } /* * Actions performed after linking/loading both the preloaded kernel and any * modules; whether preloaded or dynamicly loaded. */ static int link_elf_link_common_finish(linker_file_t lf) { #ifdef GDB elf_file_t ef = (elf_file_t)lf; char *newfilename; #endif int error; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error != 0) return (error); #ifdef GDB GDB_STATE(RT_ADD); ef->gdb.l_addr = lf->address; newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); strcpy(newfilename, lf->filename); ef->gdb.l_name = newfilename; ef->gdb.l_ld = ef->dynamic; link_elf_add_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); #endif /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } extern vm_offset_t __startkernel; static void link_elf_init(void* arg) { Elf_Dyn *dp; Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr; elf_file_t ef; char *modname; linker_add_class(&link_elf_class); dp = (Elf_Dyn *)&_DYNAMIC; modname = NULL; modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel"); if (modptr == NULL) modptr = preload_search_by_type("elf kernel"); modname = (char *)preload_search_info(modptr, MODINFO_NAME); if (modname == NULL) modname = "kernel"; linker_kernel_file = linker_make_file(modname, &link_elf_class); if (linker_kernel_file == NULL) panic("%s: Can't create linker structures for kernel", __func__); ef = (elf_file_t) linker_kernel_file; ef->preloaded = 1; #ifdef __powerpc__ ef->address = (caddr_t) (__startkernel - KERNBASE); #else ef->address = 0; #endif #ifdef SPARSE_MAPPING ef->object = 0; #endif ef->dynamic = dp; if (dp != NULL) parse_dynamic(ef); linker_kernel_file->address += KERNBASE; linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; if (modptr != NULL) { ef->modptr = modptr; baseptr = preload_search_info(modptr, MODINFO_ADDR); if (baseptr != NULL) linker_kernel_file->address = *(caddr_t *)baseptr; sizeptr = preload_search_info(modptr, MODINFO_SIZE); if (sizeptr != NULL) linker_kernel_file->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { linker_kernel_file->ctors_addr = ef->address + *ctors_addrp; linker_kernel_file->ctors_size = *ctors_sizep; } } (void)link_elf_preload_parse_symbols(ef); #ifdef GDB r_debug.r_map = NULL; r_debug.r_brk = r_debug_state; r_debug.r_state = RT_CONSISTENT; #endif (void)link_elf_link_common_finish(linker_kernel_file); linker_kernel_file->flags |= LINKER_FILE_LINKED; TAILQ_INIT(&set_pcpu_list); #ifdef VIMAGE TAILQ_INIT(&set_vnet_list); #endif } SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0); static int link_elf_preload_parse_symbols(elf_file_t ef) { caddr_t pointer; caddr_t ssym, esym, base; caddr_t strtab; int strcnt; Elf_Sym *symtab; int symcnt; if (ef->modptr == NULL) return (0); pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_SSYM); if (pointer == NULL) return (0); ssym = *(caddr_t *)pointer; pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_ESYM); if (pointer == NULL) return (0); esym = *(caddr_t *)pointer; base = ssym; symcnt = *(long *)base; base += sizeof(long); symtab = (Elf_Sym *)base; base += roundup(symcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } strcnt = *(long *)base; base += sizeof(long); strtab = base; base += roundup(strcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } ef->ddbsymtab = symtab; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbstrtab = strtab; ef->ddbstrcnt = strcnt; return (0); } static int parse_dynamic(elf_file_t ef) { Elf_Dyn *dp; int plttype = DT_REL; for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { switch (dp->d_tag) { case DT_HASH: { /* From src/libexec/rtld-elf/rtld.c */ const Elf_Hashelt *hashtab = (const Elf_Hashelt *) (ef->address + dp->d_un.d_ptr); ef->nbuckets = hashtab[0]; ef->nchains = hashtab[1]; ef->buckets = hashtab + 2; ef->chains = ef->buckets + ef->nbuckets; break; } case DT_STRTAB: ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); break; case DT_STRSZ: ef->strsz = dp->d_un.d_val; break; case DT_SYMTAB: ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); break; case DT_SYMENT: if (dp->d_un.d_val != sizeof(Elf_Sym)) return (ENOEXEC); break; case DT_PLTGOT: ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); break; case DT_REL: ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_RELSZ: ef->relsize = dp->d_un.d_val; break; case DT_RELENT: if (dp->d_un.d_val != sizeof(Elf_Rel)) return (ENOEXEC); break; case DT_JMPREL: ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_PLTRELSZ: ef->pltrelsize = dp->d_un.d_val; break; case DT_RELA: ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); break; case DT_RELASZ: ef->relasize = dp->d_un.d_val; break; case DT_RELAENT: if (dp->d_un.d_val != sizeof(Elf_Rela)) return (ENOEXEC); break; case DT_PLTREL: plttype = dp->d_un.d_val; if (plttype != DT_REL && plttype != DT_RELA) return (ENOEXEC); break; #ifdef GDB case DT_DEBUG: dp->d_un.d_ptr = (Elf_Addr)&r_debug; break; #endif } } if (plttype == DT_RELA) { ef->pltrela = (const Elf_Rela *)ef->pltrel; ef->pltrel = NULL; ef->pltrelasize = ef->pltrelsize; ef->pltrelsize = 0; } ef->ddbsymtab = ef->symtab; ef->ddbsymcnt = ef->nchains; ef->ddbstrtab = ef->strtab; ef->ddbstrcnt = ef->strsz; return (0); } static int parse_dpcpu(elf_file_t ef) { int count; int error; ef->pcpu_start = 0; ef->pcpu_stop = 0; error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start, (void ***)&ef->pcpu_stop, &count); /* Error just means there is no pcpu set to relocate. */ if (error != 0) return (0); count *= sizeof(void *); /* * Allocate space in the primary pcpu area. Copy in our * initialization from the data section and then initialize * all per-cpu storage from that. */ ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(count); if (ef->pcpu_base == 0) return (ENOSPC); memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, count); dpcpu_copy((void *)ef->pcpu_base, count); elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop, ef->pcpu_base); return (0); } #ifdef VIMAGE static int parse_vnet(elf_file_t ef) { int count; int error; ef->vnet_start = 0; ef->vnet_stop = 0; error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start, (void ***)&ef->vnet_stop, &count); /* Error just means there is no vnet data set to relocate. */ if (error != 0) return (0); count *= sizeof(void *); /* * Allocate space in the primary vnet area. Copy in our * initialization from the data section and then initialize * all per-vnet storage from that. */ ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count); if (ef->vnet_base == 0) return (ENOSPC); memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count); vnet_data_copy((void *)ef->vnet_base, count); elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop, ef->vnet_base); return (0); } #endif static int link_elf_link_preload(linker_class_t cls, const char* filename, linker_file_t *result) { Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr, dynptr; char *type; elf_file_t ef; linker_file_t lf; int error; vm_offset_t dp; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return (ENOENT); type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); dynptr = preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_DYNAMIC); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 && strcmp(type, "elf module") != 0)) return (EFTYPE); if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t) lf; ef->preloaded = 1; ef->modptr = modptr; ef->address = *(caddr_t *)baseptr; #ifdef SPARSE_MAPPING ef->object = 0; #endif dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; ef->dynamic = (Elf_Dyn *)dp; lf->address = ef->address; lf->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { lf->ctors_addr = ef->address + *ctors_addrp; lf->ctors_size = *ctors_sizep; } error = parse_dynamic(ef); if (error == 0) error = parse_dpcpu(ef); #ifdef VIMAGE if (error == 0) error = parse_vnet(ef); #endif if (error != 0) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } link_elf_reloc_local(lf); *result = lf; return (0); } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t) lf; error = relocate_file(ef); if (error != 0) return (error); (void)link_elf_preload_parse_symbols(ef); return (link_elf_link_common_finish(lf)); } static int link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) { struct nameidata nd; struct thread* td = curthread; /* XXX */ Elf_Ehdr *hdr; caddr_t firstpage; int nbytes, i; Elf_Phdr *phdr; Elf_Phdr *phlimit; Elf_Phdr *segs[MAXSEGS]; int nsegs; Elf_Phdr *phdyn; Elf_Phdr *phphdr; caddr_t mapbase; size_t mapsize; Elf_Off base_offset; Elf_Addr base_vaddr; Elf_Addr base_vlimit; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; Elf_Shdr *shdr; int symtabindex; int symstrindex; int shstrindex; int symcnt; int strcnt; char *shstrs; shdr = NULL; lf = NULL; shstrs = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; firstpage = NULL; goto out; } #ifdef MAC error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp); if (error != 0) { firstpage = NULL; goto out; } #endif /* * Read the elf header from the file. */ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); hdr = (Elf_Ehdr *)firstpage; error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); nbytes = PAGE_SIZE - resid; if (error != 0) goto out; if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } /* * We rely on the program header being in the first page. * This is not strictly required by the ABI specification, but * it seems to always true in practice. And, it simplifies * things considerably. */ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) link_elf_error(filename, "Unreadable program headers"); /* * Scan the program header entries, and save key information. * * We rely on there being exactly two load segments, text and data, * in that order. */ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); phlimit = phdr + hdr->e_phnum; nsegs = 0; phdyn = NULL; phphdr = NULL; while (phdr < phlimit) { switch (phdr->p_type) { case PT_LOAD: if (nsegs == MAXSEGS) { link_elf_error(filename, "Too many sections"); error = ENOEXEC; goto out; } /* * XXX: We just trust they come in right order ?? */ segs[nsegs] = phdr; ++nsegs; break; case PT_PHDR: phphdr = phdr; break; case PT_DYNAMIC: phdyn = phdr; break; case PT_INTERP: error = ENOSYS; goto out; } ++phdr; } if (phdyn == NULL) { link_elf_error(filename, "Object is not dynamically-linked"); error = ENOEXEC; goto out; } if (nsegs == 0) { link_elf_error(filename, "No sections"); error = ENOEXEC; goto out; } /* * Allocate the entire address space of the object, to stake * out our contiguous region, and to establish the base * address for relocation. */ base_offset = trunc_page(segs[0]->p_offset); base_vaddr = trunc_page(segs[0]->p_vaddr); base_vlimit = round_page(segs[nsegs - 1]->p_vaddr + segs[nsegs - 1]->p_memsz); mapsize = base_vlimit - base_vaddr; lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; #ifdef SPARSE_MAPPING ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); error = vm_map_find(kernel_map, ef->object, 0, (vm_offset_t *) &ef->address, mapsize, 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != 0) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } #else ef->address = malloc(mapsize, M_LINKER, M_WAITOK); #endif mapbase = ef->address; /* * Read the text and data sections and zero the bss. */ for (i = 0; i < nsegs; i++) { caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; error = vn_rdwr(UIO_READ, nd.ni_vp, segbase, segs[i]->p_filesz, segs[i]->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; bzero(segbase + segs[i]->p_filesz, segs[i]->p_memsz - segs[i]->p_filesz); #ifdef SPARSE_MAPPING /* * Wire down the pages */ error = vm_map_wire(kernel_map, (vm_offset_t) segbase, (vm_offset_t) segbase + segs[i]->p_memsz, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } #endif } #ifdef GPROF /* Update profiling information with the new text segment. */ mtx_lock(&Giant); kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + segs[0]->p_memsz)); mtx_unlock(&Giant); #endif ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); lf->address = ef->address; lf->size = mapsize; error = parse_dynamic(ef); if (error != 0) goto out; error = parse_dpcpu(ef); if (error != 0) goto out; #ifdef VIMAGE error = parse_vnet(ef); if (error != 0) goto out; #endif link_elf_reloc_local(lf); VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto out; error = relocate_file(ef); if (error != 0) goto out; /* * Try and load the symbol table if it's present. (you can * strip it!) */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0) goto nosyms; shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; /* Read section string table */ shstrindex = hdr->e_shstrndx; if (shstrindex != 0 && shdr[shstrindex].sh_type == SHT_STRTAB && shdr[shstrindex].sh_size != 0) { nbytes = shdr[shstrindex].sh_size; shstrs = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shstrs, nbytes, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; } symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_type == SHT_SYMTAB) { symtabindex = i; symstrindex = shdr[i].sh_link; } else if (shstrs != NULL && shdr[i].sh_name != 0 && strcmp(shstrs + shdr[i].sh_name, ".ctors") == 0) { /* Record relocated address and size of .ctors. */ lf->ctors_addr = mapbase + shdr[i].sh_addr - base_vaddr; lf->ctors_size = shdr[i].sh_size; } } if (symtabindex < 0 || symstrindex < 0) goto nosyms; symcnt = shdr[symtabindex].sh_size; ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); strcnt = shdr[symstrindex].sh_size; ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->symbase, symcnt, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; error = vn_rdwr(UIO_READ, nd.ni_vp, ef->strbase, strcnt, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbsymtab = (const Elf_Sym *)ef->symbase; ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; nosyms: error = link_elf_link_common_finish(lf); if (error != 0) goto out; *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error != 0 && lf != NULL) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(shdr, M_LINKER); free(firstpage, M_LINKER); free(shstrs, M_LINKER); return (error); } Elf_Addr elf_relocaddr(linker_file_t lf, Elf_Addr x) { elf_file_t ef; ef = (elf_file_t)lf; if (x >= ef->pcpu_start && x < ef->pcpu_stop) return ((x - ef->pcpu_start) + ef->pcpu_base); #ifdef VIMAGE if (x >= ef->vnet_start && x < ef->vnet_stop) return ((x - ef->vnet_start) + ef->vnet_base); #endif return (x); } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; if (ef->pcpu_base != 0) { dpcpu_free((void *)ef->pcpu_base, ef->pcpu_stop - ef->pcpu_start); elf_set_delete(&set_pcpu_list, ef->pcpu_start); } #ifdef VIMAGE if (ef->vnet_base != 0) { vnet_data_free((void *)ef->vnet_base, ef->vnet_stop - ef->vnet_start); elf_set_delete(&set_vnet_list, ef->vnet_start); } #endif #ifdef GDB if (ef->gdb.l_ld != NULL) { GDB_STATE(RT_DELETE); free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); link_elf_delete_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); } #endif /* Notify MD code that a module is being unloaded. */ elf_cpu_unload_file(file); if (ef->preloaded) { link_elf_unload_preload(file); return; } #ifdef SPARSE_MAPPING if (ef->object != NULL) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } #else free(ef->address, M_LINKER); #endif free(ef->symbase, M_LINKER); free(ef->strbase, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static void link_elf_unload_preload(linker_file_t file) { if (file->filename != NULL) preload_delete_name(file->filename); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->symtab + ELF_R_SYM(r_info); return (ef->strtab + ref->st_name); } return (NULL); } static int relocate_file(elf_file_t ef) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; /* Perform relocations without addend if there are any: */ rel = ef->rel; if (rel != NULL) { rellim = (const Elf_Rel *) ((const char *)ef->rel + ef->relsize); while (rel < rellim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rel++; } } /* Perform relocations with addend if there are any: */ rela = ef->rela; if (rela != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rela++; } } /* Perform PLT relocations without addend if there are any: */ rel = ef->pltrel; if (rel != NULL) { rellim = (const Elf_Rel *) ((const char *)ef->pltrel + ef->pltrelsize); while (rel < rellim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rel++; } } /* Perform relocations with addend if there are any: */ rela = ef->pltrela; if (rela != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->pltrela + ef->pltrelasize); while (rela < relalim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rela++; } } return (0); } /* * Hash function for symbol table lookup. Don't even think about changing * this. It is specified by the System V ABI. */ static unsigned long elf_hash(const char *name) { const unsigned char *p = (const unsigned char *) name; unsigned long h = 0; unsigned long g; while (*p != '\0') { h = (h << 4) + *p++; if ((g = h & 0xf0000000) != 0) h ^= g >> 24; h &= ~g; } return (h); } static int link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym) { elf_file_t ef = (elf_file_t) lf; unsigned long symnum; const Elf_Sym* symp; const char *strp; unsigned long hash; int i; /* If we don't have a hash, bail. */ if (ef->buckets == NULL || ef->nbuckets == 0) { printf("link_elf_lookup_symbol: missing symbol hash table\n"); return (ENOENT); } /* First, search hashed global symbols */ hash = elf_hash(name); symnum = ef->buckets[hash % ef->nbuckets]; while (symnum != STN_UNDEF) { if (symnum >= ef->nchains) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } symp = ef->symtab + symnum; if (symp->st_name == 0) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } strp = ef->strtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } symnum = ef->chains[symnum]; } /* If we have not found it, look at the full table (if loaded) */ if (ef->symtab == ef->ddbsymtab) return (ENOENT); /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } } return (ENOENT); } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym* es = (const Elf_Sym*) sym; if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) { symval->name = ef->strtab + es->st_name; symval->value = (caddr_t) ef->address + es->st_value; symval->size = es->st_size; return (0); } if (ef->symtab == ef->ddbsymtab) return (ENOENT); if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; symval->value = (caddr_t) ef->address + es->st_value; symval->size = es->st_size; return (0); } return (ENOENT); } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym* es; - const Elf_Sym* best = 0; + const Elf_Sym* best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value + (uintptr_t) (void *) ef->address; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } - if (best == 0) + if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return (0); } /* * Look up a linker set on an ELF system. */ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { c_linker_sym_t sym; linker_symval_t symval; char *setsym; void **start, **stop; int len, error = 0, count; len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ setsym = malloc(len, M_LINKER, M_WAITOK); /* get address of first entry */ snprintf(setsym, len, "%s%s", "__start_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } start = (void **)symval.value; /* get address of last entry */ snprintf(setsym, len, "%s%s", "__stop_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } stop = (void **)symval.value; /* and the number of entries */ count = stop - start; /* and copy out */ if (startp != NULL) *startp = start; if (stopp != NULL) *stopp = stop; if (countp != NULL) *countp = count; out: free(setsym, M_LINKER); return (error); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error != 0) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error != 0) return (error); error = callback(file, i, &symval, opaque); if (error != 0) return (error); } } return (0); } const Elf_Sym * elf_get_sym(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; if (symidx >= ef->nchains) return (NULL); return (ef->symtab + symidx); } const char * elf_get_symname(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; if (symidx >= ef->nchains) return (NULL); sym = ef->symtab + symidx; return (ef->strtab + sym->st_name); } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; const char *symbol; Elf_Addr addr, start, base; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->nchains) { *res = 0; return (EINVAL); } sym = ef->symtab + symidx; /* * Don't do a full lookup when the symbol is local. It may even * fail because it may not be found through the hash table. */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { /* Force lookup failure when we have an insanity. */ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) { *res = 0; return (EINVAL); } *res = ((Elf_Addr)ef->address + sym->st_value); return (0); } /* * XXX we can avoid doing a hash table based lookup for global * symbols as well. This however is not always valid, so we'll * just do it the hard way for now. Performance tweaks can * always be added. */ symbol = ef->strtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) { *res = 0; return (EINVAL); } if (elf_set_find(&set_pcpu_list, addr, &start, &base)) addr = addr - start + base; #ifdef VIMAGE else if (elf_set_find(&set_vnet_list, addr, &start, &base)) addr = addr - start + base; #endif *res = addr; return (0); } static void link_elf_reloc_local(linker_file_t lf) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; elf_file_t ef = (elf_file_t)lf; /* Perform relocations without addend if there are any: */ if ((rel = ef->rel) != NULL) { rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); while (rel < rellim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup); rel++; } } /* Perform relocations with addend if there are any: */ if ((rela = ef->rela) != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup); rela++; } } } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } Index: head/sys/kern/link_elf_obj.c =================================================================== --- head/sys/kern/link_elf_obj.c (revision 298068) +++ head/sys/kern/link_elf_obj.c (revision 298069) @@ -1,1513 +1,1513 @@ /*- * Copyright (c) 1998-2000 Doug Rabson * Copyright (c) 2004 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" typedef struct { void *addr; Elf_Off size; int flags; int sec; /* Original section */ char *name; } Elf_progent; typedef struct { Elf_Rel *rel; int nrel; int sec; } Elf_relent; typedef struct { Elf_Rela *rela; int nrela; int sec; } Elf_relaent; typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; caddr_t address; /* Relocation address */ vm_object_t object; /* VM object to hold file pages */ Elf_Shdr *e_shdr; Elf_progent *progtab; int nprogtab; Elf_relaent *relatab; int nrelatab; Elf_relent *reltab; int nreltab; Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t shstrtab; /* Section name string table */ long shstrcnt; /* number of bytes in string table */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ } *elf_file_t; #include static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t value, c_linker_sym_t *sym, long *diffp); static void link_elf_unload_file(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static int link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32_obj", #else "elf64_obj", #endif link_elf_methods, sizeof(struct elf_file) }; static int relocate_file(elf_file_t ef); static void elf_obj_cleanup_globals_cache(elf_file_t); static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_init(void *arg) { linker_add_class(&link_elf_class); } SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); static int link_elf_link_preload(linker_class_t cls, const char *filename, linker_file_t *result) { Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; void *modptr, *baseptr, *sizeptr; char *type; elf_file_t ef; linker_file_t lf; Elf_Addr off; int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return ENOENT; type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_ELFHDR); shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_SHDR); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " obj module") != 0 && strcmp(type, "elf obj module") != 0)) { return (EFTYPE); } if (baseptr == NULL || sizeptr == NULL || hdr == NULL || shdr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t)lf; ef->preloaded = 1; ef->address = *(caddr_t *)baseptr; lf->address = *(caddr_t *)baseptr; lf->size = *(size_t *)sizeptr; if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT || hdr->e_type != ET_REL || hdr->e_machine != ELF_TARG_MACH) { error = EFTYPE; goto out; } ef->e_shdr = shdr; /* Scan the section header for information and table sizing. */ symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->nprogtab++; break; case SHT_SYMTAB: symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; } } shstrindex = hdr->e_shstrndx; if (ef->nprogtab == 0 || symstrindex < 0 || symstrindex >= hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 || shstrindex >= hdr->e_shnum || shdr[shstrindex].sh_type != SHT_STRTAB) { printf("%s: bad/missing section headers\n", filename); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if ((ef->nprogtab != 0 && ef->progtab == NULL) || (ef->nreltab != 0 && ef->reltab == NULL) || (ef->nrelatab != 0 && ef->relatab == NULL)) { error = ENOMEM; goto out; } /* XXX, relocate the sh_addr fields saved by the loader. */ off = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off)) off = shdr[i].sh_addr; } for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0) shdr[i].sh_addr = shdr[i].sh_addr - off + (Elf_Addr)ef->address; } ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr; ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = (char *)shdr[shstrindex].sh_addr; /* Now fill out progtab and the relocation tables. */ pb = 0; rl = 0; ra = 0; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->progtab[pb].addr = (void *)shdr[i].sh_addr; if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { void *dpcpu; dpcpu = dpcpu_alloc(shdr[i].sh_size); if (dpcpu == NULL) { error = ENOSPC; goto out; } memcpy(dpcpu, ef->progtab[pb].addr, ef->progtab[pb].size); dpcpu_copy(dpcpu, shdr[i].sh_size); ef->progtab[pb].addr = dpcpu; #ifdef VIMAGE } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { void *vnet_data; vnet_data = vnet_data_alloc(shdr[i].sh_size); if (vnet_data == NULL) { error = ENOSPC; goto out; } memcpy(vnet_data, ef->progtab[pb].addr, ef->progtab[pb].size); vnet_data_copy(vnet_data, shdr[i].sh_size); ef->progtab[pb].addr = vnet_data; #endif } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = ef->progtab[pb].addr; lf->ctors_size = shdr[i].sh_size; } /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } pb++; break; case SHT_REL: ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr; ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; rl++; break; case SHT_RELA: ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr; ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; ra++; break; } } if (pb != ef->nprogtab) { printf("%s: lost progbits\n", filename); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { printf("%s: lost reltab\n", filename); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { printf("%s: lost relatab\n", filename); error = ENOEXEC; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf); if (error != 0) goto out; *result = lf; return (0); out: /* preload not done this way */ linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t)lf; error = relocate_file(ef); if (error) return error; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) return (error); /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } static int link_elf_load_file(linker_class_t cls, const char *filename, linker_file_t *result) { struct nameidata nd; struct thread *td = curthread; /* XXX */ Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return error; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; goto out; } #ifdef MAC error = mac_kld_check_load(td->td_ucred, nd.ni_vp); if (error) { goto out; } #endif /* Read the elf header from the file. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } lf = linker_make_file(filename, &link_elf_class); if (!lf) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; ef->nprogtab = 0; ef->e_shdr = 0; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = malloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. */ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if (symtabindex == -1) { link_elf_error(filename, "lost symbol table index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } if (symstrindex == -1) { link_elf_error(filename, "lost symbol string index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. * This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); /* * In order to satisfy amd64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. */ #ifdef __amd64__ mapbase = KERNBASE; #else mapbase = VM_MIN_KERNEL_ADDRESS; #endif error = vm_map_find(kernel_map, ef->object, 0, &mapbase, round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } /* Wire the pages */ error = vm_map_wire(kernel_map, mapbase, mapbase + round_page(mapsize), VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t)mapbase; lf->size = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab != NULL && shdr[i].sh_name != 0) { ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (!strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = (caddr_t)mapbase; lf->ctors_size = shdr[i].sh_size; } } else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); #endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS #ifdef __amd64__ || shdr[i].sh_type == SHT_X86_64_UNWIND #endif ) { error = vn_rdwr(UIO_READ, nd.ni_vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } rl++; break; case SHT_RELA: ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) { link_elf_error(filename, "lost progbits"); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { link_elf_error(filename, "lost reltab"); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { link_elf_error(filename, "lost relatab"); error = ENOEXEC; goto out; } if (mapbase != (vm_offset_t)ef->address + mapsize) { printf( "%s: mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n", filename != NULL ? filename : "", (u_long)mapbase, ef->address, (u_long)mapsize, (u_long)(vm_offset_t)ef->address + mapsize); error = ENOMEM; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf); if (error != 0) goto out; /* Pull in dependencies */ VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; /* External relocations */ error = relocate_file(ef); if (error) goto out; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) goto out; /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error && lf) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(hdr, M_LINKER); return error; } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; int i; /* Notify MD code that a module is being unloaded. */ elf_cpu_unload_file(file); if (ef->progtab) { for (i = 0; i < ef->nprogtab; i++) { if (ef->progtab[i].size == 0) continue; if (ef->progtab[i].name == NULL) continue; if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME)) dpcpu_free(ef->progtab[i].addr, ef->progtab[i].size); #ifdef VIMAGE else if (!strcmp(ef->progtab[i].name, VNET_SETNAME)) vnet_data_free(ef->progtab[i].addr, ef->progtab[i].size); #endif } } if (ef->preloaded) { free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); if (file->filename != NULL) preload_delete_name(file->filename); /* XXX reclaim module memory? */ return; } for (i = 0; i < ef->nreltab; i++) free(ef->reltab[i].rel, M_LINKER); for (i = 0; i < ef->nrelatab; i++) free(ef->relatab[i].rela, M_LINKER); free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); if (ef->object) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } free(ef->e_shdr, M_LINKER); free(ef->ddbsymtab, M_LINKER); free(ef->ddbstrtab, M_LINKER); free(ef->shstrtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->ddbsymtab + ELF_R_SYM(r_info); return ef->ddbstrtab + ref->st_name; } else return NULL; } static Elf_Addr findbase(elf_file_t ef, int sec) { int i; Elf_Addr base = 0; for (i = 0; i < ef->nprogtab; i++) { if (sec == ef->progtab[i].sec) { base = (Elf_Addr)ef->progtab[i].addr; break; } } return base; } static int relocate_file(elf_file_t ef) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; const Elf_Sym *sym; int i; Elf_Size symidx; Elf_Addr base; /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab!"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rel, ELF_RELOC_REL, elf_obj_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for relatab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* * Only clean SHN_FBSD_CACHED for successfull return. If we * modified symbol table for the object but found an * unresolved symbol, there is no reason to roll back. */ elf_obj_cleanup_globals_cache(ef); return (0); } static int link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym *symp; const char *strp; int i; for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (symp->st_shndx != SHN_UNDEF && strcmp(name, strp) == 0) { *sym = (c_linker_sym_t) symp; return 0; } } return ENOENT; } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym *es = (const Elf_Sym*) sym; if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; symval->value = (caddr_t)es->st_value; symval->size = es->st_size; return 0; } return ENOENT; } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym *es; - const Elf_Sym *best = 0; + const Elf_Sym *best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } - if (best == 0) + if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return 0; } /* * Look up a linker set on an ELF system. */ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { elf_file_t ef = (elf_file_t)lf; void **start, **stop; int i, count; /* Relative to section number */ for (i = 0; i < ef->nprogtab; i++) { if ((strncmp(ef->progtab[i].name, "set_", 4) == 0) && strcmp(ef->progtab[i].name + 4, name) == 0) { start = (void **)ef->progtab[i].addr; stop = (void **)((char *)ef->progtab[i].addr + ef->progtab[i].size); count = stop - start; if (startp) *startp = start; if (stopp) *stopp = stop; if (countp) *countp = count; return (0); } } return (ESRCH); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error) return (error); error = callback(file, i, &symval, opaque); if (error) return (error); } } return (0); } static void elf_obj_cleanup_globals_cache(elf_file_t ef) { Elf_Sym *sym; Elf_Size i; for (i = 0; i < ef->ddbsymcnt; i++) { sym = ef->ddbsymtab + i; if (sym->st_shndx == SHN_FBSD_CACHED) { sym->st_shndx = SHN_UNDEF; sym->st_value = 0; } } } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; Elf_Sym *sym; const char *symbol; Elf_Addr res1; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->ddbsymcnt) { *res = 0; return (EINVAL); } sym = ef->ddbsymtab + symidx; /* Quick answer if there is a definition included. */ if (sym->st_shndx != SHN_UNDEF) { *res = sym->st_value; return (0); } /* If we get here, then it is undefined and needs a lookup. */ switch (ELF_ST_BIND(sym->st_info)) { case STB_LOCAL: /* Local, but undefined? huh? */ *res = 0; return (EINVAL); case STB_GLOBAL: case STB_WEAK: /* Relative to Data or Function name */ symbol = ef->ddbstrtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } res1 = (Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps); /* * Cache global lookups during module relocation. The failure * case is particularly expensive for callers, who must scan * through the entire globals table doing strcmp(). Cache to * avoid doing such work repeatedly. * * After relocation is complete, undefined globals will be * restored to SHN_UNDEF in elf_obj_cleanup_globals_cache(), * above. */ if (res1 != 0) { sym->st_shndx = SHN_FBSD_CACHED; sym->st_value = res1; *res = res1; return (0); } else if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { sym->st_value = 0; *res = 0; return (0); } return (EINVAL); default: return (EINVAL); } } static void link_elf_fix_link_set(elf_file_t ef) { static const char startn[] = "__start_"; static const char stopn[] = "__stop_"; Elf_Sym *sym; const char *sym_name, *linkset_name; Elf_Addr startp, stopp; Elf_Size symidx; int start, i; startp = stopp = 0; for (symidx = 1 /* zero entry is special */; symidx < ef->ddbsymcnt; symidx++) { sym = ef->ddbsymtab + symidx; if (sym->st_shndx != SHN_UNDEF) continue; sym_name = ef->ddbstrtab + sym->st_name; if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) { start = 1; linkset_name = sym_name + sizeof(startn) - 1; } else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) { start = 0; linkset_name = sym_name + sizeof(stopn) - 1; } else continue; for (i = 0; i < ef->nprogtab; i++) { if (strcmp(ef->progtab[i].name, linkset_name) == 0) { startp = (Elf_Addr)ef->progtab[i].addr; stopp = (Elf_Addr)(startp + ef->progtab[i].size); break; } } if (i == ef->nprogtab) continue; sym->st_value = start ? startp : stopp; sym->st_shndx = i; } } static int link_elf_reloc_local(linker_file_t lf) { elf_file_t ef = (elf_file_t)lf; const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const Elf_Sym *sym; Elf_Addr base; int i; Elf_Size symidx; link_elf_fix_link_set(ef); /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; elf_reloc_local(lf, base, rel, ELF_RELOC_REL, elf_obj_lookup); } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; elf_reloc_local(lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup); } } return (0); } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } Index: head/sys/kern/sysv_sem.c =================================================================== --- head/sys/kern/sysv_sem.c (revision 298068) +++ head/sys/kern/sysv_sem.c (revision 298069) @@ -1,1661 +1,1661 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_sem, "System V semaphores support"); static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static int seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct semid_kernel *semakptr); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval); static void semundo_clear(int semid, int semnum); static struct mtx sem_mtx; /* semaphore global lock */ static struct mtx sem_undo_mtx; static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; #define SEMUNDO_MTX sem_undo_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ unsigned short un_seq; } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 50 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 340 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 50 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 150 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* semaphore maximum value */ #define SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. */ #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) /* actual size of an undo structure */ #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RWTUN, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RWTUN, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RWTUN, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_sema, "", "Semaphore id pool"); static struct syscall_helper_data sem_syscalls[] = { SYSCALL_INIT_HELPER(__semctl), SYSCALL_INIT_HELPER(semget), SYSCALL_INIT_HELPER(semop), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(semsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data sem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_semctl), SYSCALL32_INIT_HELPER_COMPAT(semget), SYSCALL32_INIT_HELPER_COMPAT(semop), SYSCALL32_INIT_HELPER(freebsd32_semsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl), #endif SYSCALL_INIT_LAST }; #endif static int seminit(void) { int i, error; sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM, M_WAITOK); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 0; #ifdef MAC mac_sysvsem_init(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); LIST_INIT(&semu_free_list); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); } LIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); error = syscall_helper_register(sem_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sem32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int semunload(void) { int i; /* XXXKIB */ if (semtot != 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(sem32_syscalls); #endif syscall_helper_unregister(sem_syscalls); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_sysvsem_destroy(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); free(sema_mtx, M_SEM); mtx_destroy(&sem_mtx); mtx_destroy(&sem_undo_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = seminit(); if (error != 0) semunload(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(struct thread *td) { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); if ((suptr = LIST_FIRST(&semu_free_list)) == NULL) return (NULL); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return (suptr); } static int semu_try_free(struct sem_undo *suptr) { SEMUNDO_LOCKASSERT(MA_OWNED); if (suptr->un_cnt != 0) return (0); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); return (1); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval) { struct proc *p = td->td_proc; struct sem_undo *suptr; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return (ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). */ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; if (suptr->un_cnt == 0) semu_try_free(suptr); } return (0); } /* Didn't find the right entry - create it */ if (adjval == 0) return (0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; sunptr->un_seq = semseq; } else return (EINVAL); return (0); } static void semundo_clear(int semid, int semnum) { struct sem_undo *suptr, *suptr1; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) { sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid) continue; if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } semu_try_free(suptr); } if (semnum != -1) break; } } } static int semvalid(int semid, struct semid_kernel *semakptr) { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0); } /* * Note that the user-mode half of this passes a union, not a pointer. */ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif int sys___semctl(struct thread *td, struct __semctl_args *uap) { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id. */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; if (cmd == IPC_RMID) mtx_lock(&sem_mtx); mtx_lock(sema_mtxp); #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; semakptr->u.sem_perm.cuid = cred->cr_uid; semakptr->u.sem_perm.uid = cred->cr_uid; semakptr->u.sem_perm.mode = 0; racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); crfree(semakptr->cred); semakptr->cred = NULL; SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); #ifdef MAC mac_sysvsem_cleanup(semakptr); #endif wakeup(semakptr); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); } for (i = semakptr->u.sem_base - sem; i < semtot; i++) sem[i] = sem[i + semakptr->u.sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) { sema[i].u.sem_base -= semakptr->u.sem_nsems; mtx_unlock(&sema_mtx[i]); } } semtot -= semakptr->u.sem_nsems; break; case IPC_SET: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); break; case GETNCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. * While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. The only way that * the 'count' could change while are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. * However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); mtx_lock(sema_mtxp); if (error) break; if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (cmd == IPC_RMID) mtx_unlock(&sem_mtx); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif int sys_semget(struct thread *td, struct semget_args *uap) { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&sem_mtx); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { DPRINTF(("found public key\n")); if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 && sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_sysvsem_check_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NSEM, nsems); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, ("Lost semaphore %d", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].cred = crhold(cred); sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.sem_base = &sem[semtot]; semtot += nsems; bzero(sema[semid].u.sem_base, sizeof(sema[semid].u.sem_base[0])*nsems); #ifdef MAC mac_sysvsem_create(cred, &sema[semid]); #endif mtx_unlock(&sema_mtx[semid]); DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&sem_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif int sys_semop(struct thread *td, struct semop_args *uap) { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct sembuf *sops; struct semid_kernel *semakptr; - struct sembuf *sopptr = 0; - struct sem *semptr = 0; + struct sembuf *sopptr = NULL; + struct sem *semptr = NULL; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; unsigned short seq; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops > seminfo.semopm) { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } else { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { PROC_UNLOCK(td->td_proc); return (E2BIG); } PROC_UNLOCK(td->td_proc); } #endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_SEM); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } seq = semakptr->u.sem_perm.seq; if (seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } /* * Initial pass thru sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). */ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ seq = semakptr->u.sem_perm.seq; if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * Renew the semaphore's pointer after wakeup since * during msleep sem_base may have been modified and semptr * is not valid any more */ semptr = &semakptr->u.sem_base[sopptr->sem_num]; /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, seq, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. */ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, seq, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_SEM); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(void *arg, struct proc *p) { struct sem_undo *suptr; struct semid_kernel *semakptr; struct mtx *sema_mtxp; int semid, semnum, adjval, ix; unsigned short seq; /* * Go through the chain of undo vectors looking for one * associated with this process. */ SEMUNDO_LOCK(); LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } if (suptr == NULL) { SEMUNDO_UNLOCK(); return; } LIST_REMOVE(suptr, un_next); DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { SEMUNDO_UNLOCK(); for (ix = 0; ix < suptr->un_cnt; ix++) { semid = suptr->un_ent[ix].un_id; semnum = suptr->un_ent[ix].un_num; adjval = suptr->un_ent[ix].un_adjval; seq = suptr->un_ent[ix].un_seq; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || (semakptr->u.sem_perm.seq != seq)) { mtx_unlock(sema_mtxp); continue; } if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.sem_base[semnum].semval)); if (adjval < 0 && semakptr->u.sem_base[semnum].semval < -adjval) semakptr->u.sem_base[semnum].semval = 0; else semakptr->u.sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); } SEMUNDO_LOCK(); } /* * Deallocate the undo vector. */ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; suptr->un_cnt = 0; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); SEMUNDO_UNLOCK(); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, sema, sizeof(struct semid_kernel) * seminfo.semmni)); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget, (sy_call_t *)sys_semop }; /* * Entry point for all SEM calls. */ int sys_semsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7___semctl_args { int semid; int semnum; int cmd; union semun_old *arg; }; #endif int freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap) { struct semid_ds_old dsold; struct semid_ds dsbuf; union semun_old arg; union semun semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsold, sizeof(dsold)); if (error) return (error); ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm); CP(dsold, dsbuf, sem_base); CP(dsold, dsbuf, sem_nsems); CP(dsold, dsbuf, sem_otime); CP(dsold, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsold, sizeof(dsold)); ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm); CP(dsbuf, dsold, sem_base); CP(dsbuf, dsold, sem_nsems); CP(dsbuf, dsold, sem_otime); CP(dsbuf, dsold, sem_ctime); error = copyout(&dsold, arg.buf, sizeof(dsold)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD{4,5,6,7} */ #ifdef COMPAT_FREEBSD32 int freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) switch (uap->which) { case 0: return (freebsd7_freebsd32_semctl(td, (struct freebsd7_freebsd32_semctl_args *)&uap->a2)); default: return (sys_semsys(td, (struct semsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_semctl(struct thread *td, struct freebsd7_freebsd32_semctl_args *uap) { struct semid_ds32_old dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif int freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap) { struct semid_ds32 dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD32 */ Index: head/sys/kern/uipc_mbuf.c =================================================================== --- head/sys/kern/uipc_mbuf.c (revision 298068) +++ head/sys/kern/uipc_mbuf.c (revision 298069) @@ -1,1869 +1,1869 @@ /*- * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_mbuf_stress_test.h" #include "opt_mbuf_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t", "void*", "void*"); SDT_PROBE_DEFINE(sdt, , , m__cljset); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, "struct mbuf *", "mbufinfo_t *"); #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; #ifdef MBUF_STRESS_TEST int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; int m_defragrandomfailures; #endif /* * sysctl(8) exported objects */ SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, &max_linkhdr, 0, "Size of largest link layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, &max_protohdr, 0, "Size of largest protocol layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, &max_hdr, 0, "Size of largest link plus protocol header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD, &max_datalen, 0, "Minimum space left in mbuf after max_hdr"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif /* * Ensure the correct size of various mbuf parameters. It could be off due * to compiler-induced padding and alignment artifacts. */ CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); /* * mbuf data storage should be 64-bit aligned regardless of architectural * pointer size; check this is the case with and without a packet header. */ CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); /* * While the specific values here don't matter too much (i.e., +/- a few * words), we do want to ensure that changes to these values are carefully * reasoned about and properly documented. This is especially the case as * network-protocol and device-driver modules encode these layouts, and must * be recompiled if the structures change. Check these values at compile time * against the ones documented in comments in mbuf.h. * * NB: Possibly they should be documented there via #define's and not just * comments. */ #if defined(__LP64__) CTASSERT(offsetof(struct mbuf, m_dat) == 32); CTASSERT(sizeof(struct pkthdr) == 56); CTASSERT(sizeof(struct m_ext) == 48); #else CTASSERT(offsetof(struct mbuf, m_dat) == 24); CTASSERT(sizeof(struct pkthdr) == 48); CTASSERT(sizeof(struct m_ext) == 28); #endif /* * Assert that the queue(3) macros produce code of the same size as an old * plain pointer does. */ #ifdef INVARIANTS static struct mbuf m_assertbuf; CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); #endif /* * Attach the cluster from *m to *n, set up m_ext in *n * and bump the refcount of the cluster. */ void mb_dupcl(struct mbuf *n, struct mbuf *m) { volatile u_int *refcnt; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n)); n->m_ext = m->m_ext; n->m_flags |= M_EXT; n->m_flags |= m->m_flags & M_RDONLY; /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; } if (*refcnt == 1) *refcnt += 1; else atomic_add_int(refcnt, 1); } void m_demote_pkthdr(struct mbuf *m) { M_ASSERTPKTHDR(m); m_tag_delete_chain(m, NULL); m->m_flags &= ~M_PKTHDR; bzero(&m->m_pkthdr, sizeof(struct pkthdr)); } /* * Clean up mbuf (chain) from any tags and packet headers. * If "all" is set then the first mbuf in the chain will be * cleaned too. */ void m_demote(struct mbuf *m0, int all, int flags) { struct mbuf *m; for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p", __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); } } /* * Sanity checks on mbuf (chain) for use in KASSERT() and general * debugging. * Returns 0 or panics when bad and 1 on all tests passed. * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they * blow up later. */ int m_sanity(struct mbuf *m0, int sanitize) { struct mbuf *m; caddr_t a, b; int pktlen = 0; #ifdef INVARIANTS #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m) #else #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m) #endif for (m = m0; m != NULL; m = m->m_next) { /* * Basic pointer checks. If any of these fails then some * unrelated kernel memory before or after us is trashed. * No way to recover from that. */ a = M_START(m); b = a + M_SIZE(m); if ((caddr_t)m->m_data < a) M_SANITY_ACTION("m_data outside mbuf data range left"); if ((caddr_t)m->m_data > b) M_SANITY_ACTION("m_data outside mbuf data range right"); if ((caddr_t)m->m_data + m->m_len > b) M_SANITY_ACTION("m_data + m_len exeeds mbuf space"); /* m->m_nextpkt may only be set on first mbuf in chain. */ if (m != m0 && m->m_nextpkt != NULL) { if (sanitize) { m_freem(m->m_nextpkt); m->m_nextpkt = (struct mbuf *)0xDEADC0DE; } else M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf"); } /* packet length (not mbuf length!) calculation */ if (m0->m_flags & M_PKTHDR) pktlen += m->m_len; /* m_tags may only be attached to first mbuf in chain. */ if (m != m0 && m->m_flags & M_PKTHDR && !SLIST_EMPTY(&m->m_pkthdr.tags)) { if (sanitize) { m_tag_delete_chain(m, NULL); /* put in 0xDEADC0DE perhaps? */ } else M_SANITY_ACTION("m_tags on in-chain mbuf"); } /* M_PKTHDR may only be set on first mbuf in chain */ if (m != m0 && m->m_flags & M_PKTHDR) { if (sanitize) { bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); m->m_flags &= ~M_PKTHDR; /* put in 0xDEADCODE and leave hdr flag in */ } else M_SANITY_ACTION("M_PKTHDR on in-chain mbuf"); } } m = m0; if (pktlen && pktlen != m->m_pkthdr.len) { if (sanitize) m->m_pkthdr.len = 0; else M_SANITY_ACTION("m_pkthdr.len != mbuf chain length"); } return 1; #undef M_SANITY_ACTION } /* * Non-inlined part of m_init(). */ int m_pkthdr_init(struct mbuf *m, int how) { #ifdef MAC int error; #endif m->m_data = m->m_pktdat; bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif return (0); } /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 /* see below for why these are not enabled */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_move_pkthdr: to has tags")); #endif #ifdef MAC /* * XXXMAC: It could be this should also occur for non-MAC? */ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) { #if 0 /* * The mbuf allocator only initializes the pkthdr * when the mbuf is allocated with m_gethdr(). Many users * (e.g. m_copy*, m_prepend) use m_get() and then * smash the pkthdr as needed causing these * assertions to trip. For now just disable them. */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); #endif MBUF_CHECKSLEEP(how); #ifdef MAC if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; if (m->m_flags & M_PKTHDR) mn = m_gethdr(how, m->m_type); else mn = m_get(how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) m_move_pkthdr(mn, m); mn->m_next = m; m = mn; if (len < M_SIZE(m)) M_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); MBUF_CHECKSLEEP(wait); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ - top = 0; + top = NULL; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } if (copyhdr) n = m_gethdr(wait, m->m_type); else n = m_get(wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. */ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MBUF_CHECKSLEEP(how); n = m_get(how, m->m_type); top = n; if (n == NULL) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { o = m_get(how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(const struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; MBUF_CHECKSLEEP(how); /* Sanity check */ if (m == NULL) return (NULL); M_ASSERTPKTHDR(m); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ if (remain >= MINCLSIZE) { n = m_getcl(how, m->m_type, 0); nsize = MCLBYTES; } else { n = m_get(how, m->m_type); nsize = MLEN; } if (n == NULL) goto nospace; if (top == NULL) { /* First one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) { m_free(n); goto nospace; } if ((n->m_flags & M_EXT) == 0) nsize = MHLEN; n->m_flags &= ~M_RDONLY; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. */ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (!M_WRITABLE(m) || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } /* * Concatenate two pkthdr mbuf chains. */ void m_catpkt(struct mbuf *m, struct mbuf *n) { M_ASSERTPKTHDR(m); M_ASSERTPKTHDR(n); m->m_pkthdr.len += n->m_pkthdr.len; m_demote(n, 1, 0); m_cat(m, n); } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod will work * for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Like m_pullup(), except a new mbuf is always allocated, and we allow * the amount of empty space before the data in the new mbuf to be specified * (in the event that the caller expects to prepend later). */ struct mbuf * m_copyup(struct mbuf *n, int len, int dstoff) { struct mbuf *m; int count, space; if (len > (MHLEN - dstoff)) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); m->m_data += dstoff; space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. */ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; MBUF_CHECKSLEEP(wait); for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR && remain == 0) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_next = m->m_next; m->m_next = NULL; n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); } else if (m0->m_flags & M_PKTHDR) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ M_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else M_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { n = m_get(wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_data = m->m_data + len; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = NULL, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); while (totlen > 0) { if (top == NULL) { /* First one, must be PKTHDR */ if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); len = MCLBYTES; } else { m = m_gethdr(M_NOWAIT, MT_DATA); len = MHLEN; /* Place initial small packet/header at end of mbuf */ if (m && totlen + off + max_linkhdr <= MLEN) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (m == NULL) return NULL; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; } else { if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, 0); len = MCLBYTES; } else { m = m_get(M_NOWAIT, MT_DATA); len = MLEN; } if (m == NULL) { m_freem(top); return NULL; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. */ void m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) goto out; bzero(mtod(n, caddr_t), MLEN); n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { if (m->m_next == NULL && (len > m->m_len - off)) { m->m_len += min(len - (m->m_len - off), M_TRAILINGSPACE(m)); } mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * Return 1 if able to complete the job; otherwise 0. */ int m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space, remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len, remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } /* * Apply function f to the data in an mbuf chain starting "off" bytes from * the beginning, continuing for "len" bytes. */ int m_apply(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { u_int count; int rval; KASSERT(off >= 0, ("m_apply, negative off %d", off)); KASSERT(len >= 0, ("m_apply, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); count = min(m->m_len - off, len); rval = (*f)(arg, mtod(m, caddr_t) + off, count); if (rval) return (rval); len -= count; off = 0; m = m->m_next; } return (0); } /* * Return a pointer to mbuf/offset of location in mbuf chain. */ struct mbuf * m_getptr(struct mbuf *m, int loc, int *off) { while (loc >= 0) { /* Normal end of search. */ if (m->m_len > loc) { *off = loc; return (m); } else { loc -= m->m_len; if (m->m_next == NULL) { if (loc == 0) { /* Point at the end of valid data. */ *off = m->m_len; return (m); } return (NULL); } m = m->m_next; } } return (NULL); } void m_print(const struct mbuf *m, int maxlen) { int len; int pdata; const struct mbuf *m2; if (m == NULL) { printf("mbuf: %p\n", m); return; } if (m->m_flags & M_PKTHDR) len = m->m_pkthdr.len; else len = -1; m2 = m; while (m2 != NULL && (len == -1 || len)) { pdata = m2->m_len; if (maxlen != -1 && pdata > maxlen) pdata = maxlen; printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); if (pdata) printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); if (len != -1) len -= m2->m_len; m2 = m2->m_next; } if (len > 0) printf("%d bytes unaccounted for.\n", len); return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } /* * Defragment a mbuf chain, returning the shortest possible * chain of mbufs and clusters. If allocation fails and * this cannot be completed, NULL will be returned, but * the passed in chain will be unchanged. Upon success, * the original chain will be freed, and the new chain * will be returned. * * If a non-packet header is passed in, the original * mbuf (chain?) will be returned unharmed. */ struct mbuf * m_defrag(struct mbuf *m0, int how) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0, length; MBUF_CHECKSLEEP(how); if (!(m0->m_flags & M_PKTHDR)) return (m0); m_fixhdr(m0); /* Needed sanity check */ #ifdef MBUF_STRESS_TEST if (m_defragrandomfailures) { int temp = arc4random() & 0xff; if (temp == 0xba) goto nospace; } #endif if (m0->m_pkthdr.len > MHLEN) m_final = m_getcl(how, MT_DATA, M_PKTHDR); else m_final = m_gethdr(how, MT_DATA); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; while (progress < m0->m_pkthdr.len) { length = m0->m_pkthdr.len - progress; if (length > MCLBYTES) length = MCLBYTES; if (m_new == NULL) { if (length > MLEN) m_new = m_getcl(how, MT_DATA, 0); else m_new = m_get(how, MT_DATA); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, length, mtod(m_new, caddr_t)); progress += length; m_new->m_len = length; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } #ifdef MBUF_STRESS_TEST if (m0->m_next == NULL) m_defraguseless++; #endif m_freem(m0); m0 = m_final; #ifdef MBUF_STRESS_TEST m_defragpackets++; m_defragbytes += m0->m_pkthdr.len; #endif return (m0); nospace: #ifdef MBUF_STRESS_TEST m_defragfailure++; #endif if (m_final) m_freem(m_final); return (NULL); } /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and * the original mbuf chain is left in it's present (potentially * modified) state. We use two techniques: collapsing consecutive * mbufs and replacing consecutive mbufs by a cluster. * * NB: this should really be named m_defrag but that name is taken */ struct mbuf * m_collapse(struct mbuf *m0, int how, int maxfrags) { struct mbuf *m, *n, *n2, **prev; u_int curfrags; /* * Calculate the current number of frags. */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) curfrags++; /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the * pkthdr. This may be suboptimal if the first mbuf has much * less data than the following. */ m = m0; again: for (;;) { n = m->m_next; if (n == NULL) break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, n->m_len); m->m_len += n->m_len; m->m_next = n->m_next; m_free(n); if (--curfrags <= maxfrags) return m0; } else m = n; } KASSERT(maxfrags > 1, ("maxfrags %u, but normal collapse failed", maxfrags)); /* * Collapse consecutive mbufs to a cluster. */ prev = &m0->m_next; /* NB: not the first mbuf */ while ((n = *prev) != NULL) { if ((n2 = n->m_next) != NULL && n->m_len + n2->m_len < MCLBYTES) { m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; bcopy(mtod(n, void *), mtod(m, void *), n->m_len); bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, n2->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; m_free(n); m_free(n2); if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ return m0; /* * Still not there, try the normal collapse * again before we allocate another cluster. */ goto again; } prev = &n->m_next; } /* * No place where we can collapse to a cluster; punt. * This can occur if, for example, you request 2 frags * but the packet requires that both be clusters (we * never reallocate the first mbuf to avoid moving the * packet header). */ bad: return NULL; } #ifdef MBUF_STRESS_TEST /* * Fragment an mbuf chain. There's no reason you'd ever want to do * this in normal usage, but it's great for stress testing various * mbuf consumers. * * If fragmentation is not possible, the original chain will be * returned. * * Possible length values: * 0 no fragmentation will occur * > 0 each fragment will be of the specified length * -1 each fragment will be the same random value in length * -2 each fragment's length will be entirely random * (Random values range from 1 to 256) */ struct mbuf * m_fragment(struct mbuf *m0, int how, int length) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0; if (!(m0->m_flags & M_PKTHDR)) return (m0); if ((length == 0) || (length < -2)) return (m0); m_fixhdr(m0); /* Needed sanity check */ m_final = m_getcl(how, MT_DATA, M_PKTHDR); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; if (length == -1) length = 1 + (arc4random() & 255); while (progress < m0->m_pkthdr.len) { int fraglen; if (length > 0) fraglen = length; else fraglen = 1 + (arc4random() & 255); if (fraglen > m0->m_pkthdr.len - progress) fraglen = m0->m_pkthdr.len - progress; if (fraglen > MCLBYTES) fraglen = MCLBYTES; if (m_new == NULL) { m_new = m_getcl(how, MT_DATA, 0); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t)); progress += fraglen; m_new->m_len = fraglen; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } m_freem(m0); m0 = m_final; return (m0); nospace: if (m_final) m_freem(m_final); /* Return the original chain on failure */ return (m0); } #endif /* * Copy the contents of uio into a properly sized mbuf chain. */ struct mbuf * m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) { struct mbuf *m, *mb; int error, length; ssize_t total; int progress = 0; /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. */ if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* * The smallest unit returned by m_getm2() is a single mbuf * with pkthdr. We can't align past it. */ if (align >= MHLEN) return (NULL); /* * Give us the full allocation or nothing. * If len is zero return the smallest empty mbuf. */ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); if (m == NULL) return (NULL); m->m_data += align; /* Fill all mbufs with uio data and update header information. */ for (mb = m; mb != NULL; mb = mb->m_next) { length = min(M_TRAILINGSPACE(mb), total - progress); error = uiomove(mtod(mb, void *), length, uio); if (error) { m_freem(m); return (NULL); } mb->m_len = length; progress += length; if (flags & M_PKTHDR) m->m_pkthdr.len += length; } KASSERT(progress == total, ("%s: progress != total", __func__)); return (m); } /* * Copy an mbuf chain into a uio limited by len if set. */ int m_mbuftouio(struct uio *uio, struct mbuf *m, int len) { int error, length, total; int progress = 0; if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* Fill the uio with data from the mbufs. */ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); error = uiomove(mtod(m, void *), length, uio); if (error) return (error); progress += length; } return (0); } /* * Create a writable copy of the mbuf chain. While doing this * we compact the chain with a goal of producing a chain with * at most two mbufs. The second mbuf in this chain is likely * to be a cluster. The primary purpose of this work is to create * a writable packet for encryption, compression, etc. The * secondary goal is to linearize the data so the data can be * passed to crypto hardware in the most efficient manner possible. */ struct mbuf * m_unshare(struct mbuf *m0, int how) { struct mbuf *m, *mprev; struct mbuf *n, *mfirst, *mlast; int len, off; mprev = NULL; for (m = m0; m != NULL; m = mprev->m_next) { /* * Regular mbufs are ignored unless there's a cluster * in front of it that we can use to coalesce. We do * the latter mainly so later clusters can be coalesced * also w/o having to handle them specially (i.e. convert * mbuf+cluster -> cluster). This optimization is heavily * influenced by the assumption that we're running over * Ethernet where MCLBYTES is large enough that the max * packet size will permit lots of coalescing into a * single cluster. This in turn permits efficient * crypto operations, especially when using hardware. */ if ((m->m_flags & M_EXT) == 0) { if (mprev && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_mbcoalesced++; #endif } else { mprev = m; } continue; } /* * Writable mbufs are left alone (for now). */ if (M_WRITABLE(m)) { mprev = m; continue; } /* * Not writable, replace with a copy or coalesce with * the previous mbuf if possible (since we have to copy * it anyway, we try to reduce the number of mbufs and * clusters so that future work is easier). */ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); /* NB: we only coalesce into a cluster or larger */ if (mprev != NULL && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_clcoalesced++; #endif continue; } /* * Allocate new space to hold the copy and copy the data. * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by * splitting them into clusters. We could just malloc a * buffer and make it external but too many device drivers * don't know how to break up the non-contiguous memory when * doing DMA. */ n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(m0); return (NULL); } if (m->m_flags & M_PKTHDR) { KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", __func__, m0, m)); m_move_pkthdr(n, m); } len = m->m_len; off = 0; mfirst = n; mlast = NULL; for (;;) { int cc = min(len, MCLBYTES); memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); n->m_len = cc; if (mlast != NULL) mlast->m_next = n; mlast = n; #if 0 newipsecstat.ips_clcopied++; #endif len -= cc; if (len <= 0) break; off += cc; n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(mfirst); m_freem(m0); return (NULL); } } n->m_next = m->m_next; if (mprev == NULL) m0 = mfirst; /* new head of chain */ else mprev->m_next = mfirst; /* replace old mbuf */ m_free(m); /* release old mbuf */ mprev = mfirst; } return (m0); } #ifdef MBUF_PROFILING #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ struct mbufprofile { uintmax_t wasted[MP_BUCKETS]; uintmax_t used[MP_BUCKETS]; uintmax_t segments[MP_BUCKETS]; } mbprof; #define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */ #define MP_NUMLINES 6 #define MP_NUMSPERLINE 16 #define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */ /* work out max space needed and add a bit of spare space too */ #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE) #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES) char mbprofbuf[MP_BUFSIZE]; void m_profile(struct mbuf *m) { int segments = 0; int used = 0; int wasted = 0; while (m) { segments++; used += m->m_len; if (m->m_flags & M_EXT) { wasted += MHLEN - sizeof(m->m_ext) + m->m_ext.ext_size - m->m_len; } else { if (m->m_flags & M_PKTHDR) wasted += MHLEN - m->m_len; else wasted += MLEN - m->m_len; } m = m->m_next; } /* be paranoid.. it helps */ if (segments > MP_BUCKETS - 1) segments = MP_BUCKETS - 1; if (used > 100000) used = 100000; if (wasted > 100000) wasted = 100000; /* store in the appropriate bucket */ /* don't bother locking. if it's slightly off, so what? */ mbprof.segments[segments]++; mbprof.used[fls(used)]++; mbprof.wasted[fls(wasted)]++; } static void mbprof_textify(void) { int offset; char *c; uint64_t *p; p = &mbprof.wasted[0]; c = mbprofbuf; offset = snprintf(c, MP_MAXLINE + 10, "wasted:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.wasted[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.used[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "used:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.used[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.segments[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "segments:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.segments[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %jju", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif } static int mbprof_handler(SYSCTL_HANDLER_ARGS) { int error; mbprof_textify(); error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1); return (error); } static int mbprof_clr_handler(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(&mbprof, sizeof(mbprof)); } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, mbprof_handler, "A", "mbuf profiling statistics"); SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics"); #endif Index: head/sys/kern/uipc_sockbuf.c =================================================================== --- head/sys/kern/uipc_sockbuf.c (revision 298068) +++ head/sys/kern/uipc_sockbuf.c (revision 298069) @@ -1,1332 +1,1332 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include /* for aio_swake proto */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Function pointer set by the AIO routines so that the socket buffer code * can call back into the AIO module if it is loaded. */ void (*aio_swake)(struct socket *, struct sockbuf *); /* * Primitive routines for operating on socket buffers */ u_long sb_max = SB_MAX; u_long sb_max_adj = (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */ static u_long sb_efficiency = 8; /* parameter for sbreserve() */ static struct mbuf *sbcut_internal(struct sockbuf *sb, int len); static void sbflush_internal(struct sockbuf *sb); /* * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY. */ static void sbm_clrprotoflags(struct mbuf *m, int flags) { int mask; mask = ~M_PROTOFLAGS; if (flags & PRUS_NOTREADY) mask |= M_NOTREADY; while (m) { m->m_flags &= mask; m = m->m_next; } } /* * Mark ready "count" mbufs starting with "m". */ int sbready(struct sockbuf *sb, struct mbuf *m, int count) { u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; for (int i = 0; i < count; i++, m = m->m_next) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; } if (!blocker) return (EINPROGRESS); /* This one was blocking all the queue. */ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { KASSERT(m->m_flags & M_BLOCKED, ("%s: m %p !M_BLOCKED", __func__, m)); m->m_flags &= ~M_BLOCKED; sb->sb_acc += m->m_len; } sb->sb_fnrdy = m; return (0); } /* * Adjust sockbuf state reflecting allocation of m. */ void sballoc(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) { if (m->m_flags & M_NOTREADY) sb->sb_fnrdy = m; else sb->sb_acc += m->m_len; } else m->m_flags |= M_BLOCKED; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl += m->m_len; sb->sb_mbcnt += MSIZE; sb->sb_mcnt += 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt += m->m_ext.ext_size; sb->sb_ccnt += 1; } } /* * Adjust sockbuf state reflecting freeing of m. */ void sbfree(struct sockbuf *sb, struct mbuf *m) { #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ SOCKBUF_LOCK_ASSERT(sb); #endif sb->sb_ccc -= m->m_len; if (!(m->m_flags & M_NOTAVAIL)) sb->sb_acc -= m->m_len; if (m == sb->sb_fnrdy) { struct mbuf *n; KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); n = m->m_next; while (n != NULL && !(n->m_flags & M_NOTREADY)) { n->m_flags &= ~M_BLOCKED; sb->sb_acc += n->m_len; n = n->m_next; } sb->sb_fnrdy = n; } if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= m->m_len; sb->sb_mbcnt -= MSIZE; sb->sb_mcnt -= 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt -= m->m_ext.ext_size; sb->sb_ccnt -= 1; } if (sb->sb_sndptr == m) { sb->sb_sndptr = NULL; sb->sb_sndptroff = 0; } if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= m->m_len; } /* * Socantsendmore indicates that no more data will be sent on the socket; it * would normally be applied to a socket when the user informs the system * that no more data is to be sent, by the protocol code (in case * PRU_SHUTDOWN). Socantrcvmore indicates that no more data will be * received, and will normally be applied to the socket by a protocol when it * detects that the peer will send no more data. Data queued for reading in * the socket may yet be read. */ void socantsendmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantsendmore(struct socket *so) { SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantrcvmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void socantrcvmore(struct socket *so) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags |= SB_WAIT; return (msleep_sbt(&sb->sb_acc, &sb->sb_mtx, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo, 0, 0)); } int sblock(struct sockbuf *sb, int flags) { KASSERT((flags & SBL_VALID) == flags, ("sblock: flags invalid (0x%x)", flags)); if (flags & SBL_WAIT) { if ((sb->sb_flags & SB_NOINTR) || (flags & SBL_NOINTR)) { sx_xlock(&sb->sb_sx); return (0); } return (sx_xlock_sig(&sb->sb_sx)); } else { if (sx_try_xlock(&sb->sb_sx) == 0) return (EWOULDBLOCK); return (0); } } void sbunlock(struct sockbuf *sb) { sx_xunlock(&sb->sb_sx); } /* * Wakeup processes waiting on a socket buffer. Do asynchronous notification * via SIGIO if the socket has the SS_ASYNC flag set. * * Called with the socket buffer lock held; will release the lock by the end * of the function. This allows the caller to acquire the socket buffer lock * while testing for the need for various sorts of wakeup and hold it through * to the point where it's no longer required. We currently hold the lock * through calls out to other subsystems (with the exception of kqueue), and * then release it to avoid lock order issues. It's not clear that's * correct. */ void sowakeup(struct socket *so, struct sockbuf *sb) { int ret; SOCKBUF_LOCK_ASSERT(sb); selwakeuppri(&sb->sb_sel, PSOCK); if (!SEL_WAITING(&sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } KNOTE_LOCKED(&sb->sb_sel.si_note, 0); if (sb->sb_upcall != NULL) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { KASSERT(sb == &so->so_rcv, ("SO_SND upcall returned SU_ISCONNECTED")); soupcall_clear(so, SO_RCV); } } else ret = SU_OK; if (sb->sb_flags & SB_AIO) sowakeup_aio(so, sb); SOCKBUF_UNLOCK(sb); if (ret == SU_ISCONNECTED) soisconnected(so); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and one for * receiving data. Each buffer contains a queue of mbufs, information about * the number of mbufs and amount of data in the queue, and other fields * allowing select() statements and notification on data availability to be * implemented. * * Data stored in a socket buffer is maintained as a list of records. Each * record is a list of mbufs chained together with the m_next field. Records * are chained together with the m_nextpkt field. The upper level routine * soreceive() expects the following conventions to be observed when placing * information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's name, * then a record containing that name must be present before any * associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really just * additional data associated with the message), and there are ``rights'' * to be received, then a record containing this data should be present * (mbuf's must be of type MT_RIGHTS). * 3. If a name or rights record exists, then it must be followed by a data * record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space should * be released by calling sbrelease() when the socket is destroyed. */ int soreserve(struct socket *so, u_long sndcc, u_long rcvcc) { struct thread *td = curthread; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0) goto bad; if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (0); bad2: sbrelease_locked(&so->so_snd, so); bad: SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (ENOBUFS); } static int sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_sb_max = sb_max; error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req); if (error || !req->newptr) return (error); if (tmp_sb_max < MSIZE + MCLBYTES) return (EINVAL); sb_max = tmp_sb_max; sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); return (0); } /* * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't * become limiting if buffering efficiency is near the normal case. */ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td) { rlim_t sbsize_limit; SOCKBUF_LOCK_ASSERT(sb); /* * When a thread is passed, we take into account the thread's socket * buffer size limit. The caller will generally pass curthread, but * in the TCP input path, NULL will be passed to indicate that no * appropriate thread resource limits are available. In that case, * we don't apply a process limit. */ if (cc > sb_max_adj) return (0); if (td != NULL) { sbsize_limit = lim_cur(td, RLIMIT_SBSIZE); } else sbsize_limit = RLIM_INFINITY; if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, sbsize_limit)) return (0); sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); } int sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td) { int error; SOCKBUF_LOCK(sb); error = sbreserve_locked(sb, cc, so, td); SOCKBUF_UNLOCK(sb); return (error); } /* * Free mbufs held by a socket, and reserved mbuf space. */ void sbrelease_internal(struct sockbuf *sb, struct socket *so) { sbflush_internal(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } void sbrelease_locked(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK_ASSERT(sb); sbrelease_internal(sb, so); } void sbrelease(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK(sb); sbrelease_locked(sb, so); SOCKBUF_UNLOCK(sb); } void sbdestroy(struct sockbuf *sb, struct socket *so) { sbrelease_internal(sb, so); } /* * Routines to add and remove data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to append * new mbufs to a socket buffer, after checking that adequate space is * available, comparing the function sbspace() with the amount of data to be * added. sbappendrecord() differs from sbappend() in that data supplied is * treated as the beginning of a new record. To place a sender's address, * optional access rights, and data in a socket receive buffer, * sbappendaddr() should be used. To place access rights and data in a * socket receive buffer, sbappendrights() should be used. In either case, * the new data begins a new record. Note that unlike sbappend() and * sbappendrecord(), these routines check for the caller that there will be * enough space to store the data. Each fails if there is not enough space, * or if it cannot find mbufs to store additional information in. * * Reliable protocols may use the socket send buffer to hold data awaiting * acknowledgement. Data is normally copied from a socket send buffer in a * protocol with m_copy for output to a peer, and then removing the data from * the socket buffer with sbdrop() or sbdroprecord() when the data is * acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("%s: sb_mb %p sb_lastrecord %p last %p\n", __func__, sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("%s from %s:%u", __func__, file, line); } } void sblastmbufchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("%s: sb_mb %p sb_mbtail %p last %p\n", __func__, sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("%s from %s:%u", __func__, file, line); } } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags) { struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); - if (m == 0) + if (m == NULL) return; sbm_clrprotoflags(m, flags); SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, * it's also the last record. */ sb->sb_lastrecord = m; } } sbcompress(sb, m, n); SBLASTRECORDCHK(sb); } /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappend_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0); sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappendstream_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } #ifdef SOCKBUF_DEBUG void sbcheck(struct sockbuf *sb, const char *file, int line) { struct mbuf *m, *n, *fnrdy; u_long acc, ccc, mbcnt; SOCKBUF_LOCK_ASSERT(sb); acc = ccc = mbcnt = 0; fnrdy = NULL; for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { if (m->m_len == 0) { printf("sb %p empty mbuf %p\n", sb, m); goto fail; } if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) { if (m != sb->sb_fnrdy) { printf("sb %p: fnrdy %p != m %p\n", sb, sb->sb_fnrdy, m); goto fail; } fnrdy = m; } if (fnrdy) { if (!(m->m_flags & M_NOTAVAIL)) { printf("sb %p: fnrdy %p, m %p is avail\n", sb, sb->sb_fnrdy, m); goto fail; } } else acc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } } if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) { printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n", acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt); goto fail; } return; fail: panic("%s from %s:%u", __func__, file, line); } #endif /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); - if (m0 == 0) + if (m0 == NULL) return; m_clrprotoflags(m0); /* * Put the first mbuf on the queue. Note this permits zero length * records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb); SBLINKRECORD(sb, m0); sb->sb_mbtail = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } /* always call sbcompress() so it can do SBLASTMBUFCHK() */ sbcompress(sb, m, m0); } /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord(struct sockbuf *sb, struct mbuf *m0) { SOCKBUF_LOCK(sb); sbappendrecord_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* Helper routine that appends data, control, and address to a sockbuf. */ static int sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last) { struct mbuf *m, *n, *nlast; #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif m = m_get(M_NOWAIT, MT_SONAME); if (m == NULL) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (m0) m_clrprotoflags(m0); if (ctrl_last) ctrl_last->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &ctrl_last); if (space > sbspace(sb)) return (0); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if insufficient mbufs. Does not validate space * on the receiving sockbuf. */ int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; SOCKBUF_LOCK_ASSERT(sb); ctrl_last = (control == NULL) ? NULL : m_last(control); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendaddr_locked(sb, asa, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *mlast; int space; SOCKBUF_LOCK_ASSERT(sb); - if (control == 0) + if (control == NULL) panic("sbappendcontrol_locked"); space = m_length(control, &n) + m_length(m0, NULL); if (space > sbspace(sb)) return (0); m_clrprotoflags(m0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendcontrol_locked(sb, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } /* * Append the data in mbuf chain (m) into the socket buffer sb following mbuf * (n). If (n) is NULL, the buffer is presumed empty. * * When the data is compressed, mbufs in the chain may be handled in one of * three ways: * * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no * record boundary, and no change in data type). * * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into * an mbuf already in the socket buffer. This can occur if an * appropriate mbuf exists, there is room, both mbufs are not marked as * not ready, and no merging of data types will occur. * * (3) The mbuf may be appended to the end of the existing mbuf chain. * * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as * end-of-record. */ void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { int eor = 0; struct mbuf *o; SOCKBUF_LOCK_ASSERT(sb); while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && !(n->m_flags & M_NOTREADY) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, (unsigned)m->m_len); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) sb->sb_acc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) /* XXX: Probably don't need.*/ sb->sb_ctl += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { KASSERT(n != NULL, ("sbcompress: eor && n == NULL")); n->m_flags |= eor; } SBLASTMBUFCHK(sb); } /* * Free all mbufs in a sockbuf. Check that all resources are reclaimed. */ static void sbflush_internal(struct sockbuf *sb) { while (sb->sb_mbcnt) { /* * Don't call sbcut(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. */ if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; m_freem(sbcut_internal(sb, (int)sb->sb_ccc)); } KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0, ("%s: ccc %u mb %p mbcnt %u", __func__, sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt)); } void sbflush_locked(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sbflush_internal(sb); } void sbflush(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbflush_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Cut data from (the front of) a sockbuf. */ static struct mbuf * sbcut_internal(struct sockbuf *sb, int len) { struct mbuf *m, *next, *mfree; next = (m = sb->sb_mb) ? m->m_nextpkt : 0; mfree = NULL; while (len > 0) { if (m == NULL) { KASSERT(next, ("%s: no next, len %d", __func__, len)); m = next; next = m->m_nextpkt; } if (m->m_len > len) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p M_NOTAVAIL", __func__, m)); m->m_len -= len; m->m_data += len; sb->sb_ccc -= len; sb->sb_acc -= len; if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; break; } len -= m->m_len; sbfree(sb, m); /* * Do not put M_NOTREADY buffers to the free list, they * are referenced from outside. */ if (m->m_flags & M_NOTREADY) m = m->m_next; else { struct mbuf *n; n = m->m_next; m->m_next = mfree; mfree = m; m = n; } } /* * Free any zero-length mbufs from the buffer. * For SOCK_DGRAM sockets such mbufs represent empty records. * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer, * when sosend_generic() needs to send only control data. */ while (m && m->m_len == 0) { struct mbuf *n; sbfree(sb, m); n = m->m_next; m->m_next = mfree; mfree = m; m = n; } if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure * sb_lastrecord is up-to-date if we dropped part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } return (mfree); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); m_freem(sbcut_internal(sb, len)); } /* * Drop data from (the front of) a sockbuf, * and return it to caller. */ struct mbuf * sbcut_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); return (sbcut_internal(sb, len)); } void sbdrop(struct sockbuf *sb, int len) { struct mbuf *mfree; SOCKBUF_LOCK(sb); mfree = sbcut_internal(sb, len); SOCKBUF_UNLOCK(sb); m_freem(mfree); } /* * Maintain a pointer and offset pair into the socket buffer mbuf chain to * avoid traversal of the entire socket buffer for larger offsets. */ struct mbuf * sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff) { struct mbuf *m, *ret; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); KASSERT(off + len <= sb->sb_acc, ("%s: beyond sb", __func__)); KASSERT(sb->sb_sndptroff <= sb->sb_acc, ("%s: sndptroff broken", __func__)); /* * Is off below stored offset? Happens on retransmits. * Just return, we can't help here. */ if (sb->sb_sndptroff > off) { *moff = off; return (sb->sb_mb); } /* Return closest mbuf in chain for current offset. */ *moff = off - sb->sb_sndptroff; m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb; if (*moff == m->m_len) { *moff = 0; sb->sb_sndptroff += m->m_len; m = ret = m->m_next; KASSERT(ret->m_len > 0, ("mbuf %p in sockbuf %p chain has no valid data", ret, sb)); } /* Advance by len to be as close as possible for the next transmit. */ for (off = off - sb->sb_sndptroff + len - 1; off > 0 && m != NULL && off >= m->m_len; m = m->m_next) { sb->sb_sndptroff += m->m_len; off -= m->m_len; } if (off > 0 && m == NULL) panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret); sb->sb_sndptr = m; return (ret); } /* * Return the first mbuf and the mbuf data offset for the provided * send offset without changing the "sb_sndptroff" field. */ struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff) { struct mbuf *m; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); /* * If the "off" is below the stored offset, which happens on * retransmits, just use "sb_mb": */ if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { m = sb->sb_mb; } else { m = sb->sb_sndptr; off -= sb->sb_sndptroff; } while (off > 0 && m != NULL) { if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } *moff = off; return (m); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord_locked(struct sockbuf *sb) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); m = m_free(m); } while (m); } SB_EMPTY_FIXUP(sb); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbdroprecord_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Create a "control" mbuf containing the specified data with the specified * type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level) { struct cmsghdr *cp; struct mbuf *m; if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size) > MLEN) m = m_getcl(M_NOWAIT, MT_CONTROL, 0); else m = m_get(M_NOWAIT, MT_CONTROL); if (m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), ("sbcreatecontrol: short mbuf")); /* * Don't leave the padding between the msg header and the * cmsg data and the padding after the cmsg data un-initialized. */ bzero(cp, CMSG_SPACE((u_int)size)); if (p != NULL) (void)memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; return (m); } /* * This does the same for socket buffers that sotoxsocket does for sockets: * generate an user-format data structure describing the socket buffer. Note * that the xsockbuf structure, since it is always embedded in a socket, does * not include a self pointer nor a length. We make this entry point public * in case some other mechanism needs it. */ void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mcnt = sb->sb_mcnt; xsb->sb_ccnt = sb->sb_ccnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ static int dummy; SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW, &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size"); SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, "Socket buffer size waste factor"); Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 298068) +++ head/sys/kern/uipc_syscalls.c (revision 298069) @@ -1,1757 +1,1757 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include /* * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC * and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp) { struct file *fp; int error; error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(td, uap) struct thread *td; struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct socket *so; struct file *fp; int fd, error, type, oflag, fflag; AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); type = uap->type; oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, uap->domain, type, uap->protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(uap->domain, &so, type, uap->protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_bind(td, uap) struct thread *td; struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), &fp, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_bindat(td, uap) struct thread *td; struct bindat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } /* ARGSUSED */ int sys_listen(td, uap) struct thread *td; struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), &fp, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, uap->backlog, td); fdrop(fp, td); } return(error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (flags & ACCEPT4_COMPAT) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), &headfp, &fflag); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); if (error != 0) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error != 0) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); - sa = 0; + sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ /* ARGSUSED */ int sys_connect(td, uap) struct thread *td; struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error, interrupted = 0; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), &fp, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_connectat(td, uap) struct thread *td; struct connectat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(td, s, mp, flags) struct thread *td; int s; struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); cap_rights_init(&rights, CAP_SEND); if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } error = getsock_cap(td, s, &rights, &fp, NULL); if (error != 0) return (error); so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) goto bad; } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(td, uap) struct thread *td; struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(td, uap) struct thread *td; struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; struct mbuf *m, *control = NULL; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), &fp, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(td, uap) struct thread *td; struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(td, uap) struct thread *td; struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } /* ARGSUSED */ int sys_shutdown(td, uap) struct thread *td; struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } /* ARGSUSED */ int sys_setsockopt(td, uap) struct thread *td; struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } /* ARGSUSED */ int sys_getsockopt(td, uap) struct thread *td; struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } Index: head/sys/kern/vfs_cluster.c =================================================================== --- head/sys/kern/vfs_cluster.c (revision 298068) +++ head/sys/kern/vfs_cluster.c (revision 298069) @@ -1,1077 +1,1077 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_debug_cluster.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CLUSTERDEBUG) static int rcluster= 0; SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "Debug VFS clustering code"); #endif static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); static struct cluster_save *cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags); static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); static int read_max = 64; SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, "Cluster read-ahead max block count"); static int read_min = 1; SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, "Cluster read min block count"); /* Page expended to mark partially backed buffers */ extern vm_page_t bogus_page; /* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. */ int cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, struct ucred *cred, long totread, int seqcount, int gbflags, struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; bo = &vp->v_bufobj; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; /* * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; maxra = min(read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; /* * get the requested block */ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags); if (bp == NULL) return (EBUSY); origblkno = lblkno; /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seqcount) { return 0; } else if ((bp->b_flags & B_RAM) == 0) { return 0; } else { bp->b_flags &= ~B_RAM; BO_RLOCK(bo); for (i = 1; i < maxra; i++) { /* * Stop if the buffer does not exist or it * is invalid (about to go away?) */ rbp = gbincore(&vp->v_bufobj, lblkno+i); if (rbp == NULL || (rbp->b_flags & B_INVAL)) break; /* * Set another read-ahead mark so we know * to check again. (If we can lock the * buffer without waiting) */ if ((((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) && (0 == BUF_LOCK(rbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))) { rbp->b_flags |= B_RAM; BUF_UNLOCK(rbp); } } BO_RUNLOCK(bo); if (i >= maxra) { return 0; } lblkno += i; } reqbp = bp = NULL; /* * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. */ } else { off_t firstread = bp->b_offset; int nblks; long minread; KASSERT(bp->b_offset != NOOFFSET, ("cluster_read: no buffer offset")); ncontig = 0; /* * Adjust totread if needed */ minread = read_min * size; if (minread > totread) totread = minread; /* * Compute the total number of blocks that we should read * synchronously. */ if (firstread + totread > filesize) totread = filesize - firstread; nblks = howmany(totread, size); if (nblks > racluster) nblks = racluster; /* * Now compute the number of contiguous blocks. */ if (nblks > 1) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); /* * If this failed to map just do the original block. */ if (error || blkno == -1) ncontig = 0; } /* * If we have contiguous data available do a cluster * otherwise just read the requested block. */ if (ncontig) { /* Account for our first block. */ ncontig = min(ncontig + 1, nblks); if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; bp->b_iocmd = BIO_READ; lblkno += 1; } } /* * handle the synchronous read so that it is available ASAP. */ if (bp) { if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } /* * If we have been doing sequential I/O, then do some read-ahead. */ while (lblkno < (origblkno + maxra)) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; if (blkno == -1) break; /* * We could throttle ncontig here by maxra but we might as * well read the data if it is contiguous. We're throttled * by racluster anyway. */ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; rbp->b_blkno = blkno; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; bqrelse(rbp); continue; } if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); bstrategy(rbp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rbp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } if (reqbp) { /* * Like bread, always brelse() the buffer when * returning an error. */ error = bufwait(reqbp); if (error != 0) { brelse(reqbp); *bpp = NULL; } } return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct buf *bp, *tbp; daddr_t bn; off_t off; long tinc, tsize; int i, inc, j, k, toff; KASSERT(size == vp->v_mount->mnt_stat.f_iosize, ("cluster_rbuild: size %ld != f_iosize %jd\n", size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } if (fbp) { tbp = fbp; tbp->b_iocmd = BIO_READ; } else { tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; } tbp->b_blkno = blkno; if( (tbp->b_flags & B_MALLOC) || ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = trypbuf(&cluster_pbuf_freecnt); - if (bp == 0) + if (bp == NULL) return tbp; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; } else { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; bp->b_offset = tbp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i == 0) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); vfs_drain_busy_pages(tbp); vm_object_pip_add(tbp->b_bufobj->bo_object, tbp->b_npages); for (k = 0; k < tbp->b_npages; k++) vm_page_sbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } else { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > vp->v_mount->mnt_iosize_max) { break; } tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | (gbflags & GB_UNMAPPED)); /* Don't wait around for locked bufs. */ if (tbp == NULL) break; /* * Stop scanning if the buffer is fully valid * (marked B_CACHE), or locked (may be doing a * background write), or if the buffer is not * VMIO backed. The clustering code can only deal * with VMIO-backed buffers. The bo lock is not * required for the BKGRDINPROG check since it * can not be set without the buf lock. */ if ((tbp->b_vflags & BV_BKGRDINPROG) || (tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { bqrelse(tbp); break; } /* * The buffer must be completely invalid in order to * take part in the cluster. If it is partially valid * then we stop. */ off = tbp->b_offset; tsize = size; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; tsize > 0; j++) { toff = off & PAGE_MASK; tinc = tsize; if (toff + tinc > PAGE_SIZE) tinc = PAGE_SIZE - toff; VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object); if ((tbp->b_pages[j]->valid & vm_page_bits(toff, tinc)) != 0) break; if (vm_page_xbusied(tbp->b_pages[j])) break; vm_object_pip_add(tbp->b_bufobj->bo_object, 1); vm_page_sbusy(tbp->b_pages[j]); off += tinc; tsize -= tinc; } if (tsize > 0) { clean_sbusy: vm_object_pip_add(tbp->b_bufobj->bo_object, -j); for (k = 0; k < j; k++) vm_page_sunbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); bqrelse(tbp); break; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Set a read-ahead mark as appropriate */ if ((fbp && (i == 1)) || (i == (run - 1))) tbp->b_flags |= B_RAM; /* * Set the buffer up for an async read (XXX should * we do this only if we do not wind up brelse()ing?). * Set the block number if it isn't set, otherwise * if it is make sure it matches the block number we * expect. */ tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_READ; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); goto clean_sbusy; } } /* * XXX fbp from caller may not be B_ASYNC, but we are going * to biodone() it in cluster_callback() anyway */ BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } if (m->valid == VM_PAGE_BITS_ALL) tbp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Don't inherit tbp->b_bufsize as it may be larger due to * a non-page-aligned size. Instead just aggregate using * 'size'. */ if (tbp->b_bcount != size) printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); if (tbp->b_bufsize != size) printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); bp->b_bcount += size; bp->b_bufsize += size; } /* * Fully valid pages in the cluster are already good and do not need * to be re-read from disk. Replace the page with bogus_page */ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (j = 0; j < bp->b_npages; j++) { VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object); if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL) bp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); if (bp->b_bufsize > bp->b_kvasize) panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ static void cluster_callback(bp) struct buf *bp; { struct buf *nbp, *tbp; int error = 0; /* * Must propogate errors to all the components. */ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; if (buf_mapped(bp)) { pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); tbp; tbp = nbp) { nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_ioflags |= BIO_ERROR; tbp->b_error = error; } else { tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; /* * XXX the bdwrite()/bqrelse() issued during * cluster building clears B_RELBUF (see bqrelse() * comment). If direct I/O was specified, we have * to restore it here to allow the buffer and VM * to be freed. */ if (tbp->b_flags & B_DIRECT) tbp->b_flags |= B_RELBUF; } bufdone(tbp); } pbrelvp(bp); relpbuf(bp, &cluster_pbuf_freecnt); } /* * cluster_wbuild_wb: * * Implement modified write build for cluster. * * write_behind = 0 write behind disabled * write_behind = 1 write behind normal (default) * write_behind = 2 write behind backed-off */ static __inline int cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { int r = 0; switch (write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ break; } return(r); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, int gbflags) { daddr_t lbn; int maxclen, cursize; int lblocksize; int async; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; if (vp->v_type == VREG) { async = DOINGASYNC(vp); lblocksize = vp->v_mount->mnt_stat.f_iosize; } else { async = 0; lblocksize = bp->b_bufsize; } lbn = bp->b_lblkno; KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. * * Change to algorithm: only push previous cluster if * it was sequential from the point of view of the * seqcount heuristic, otherwise leave the buffer * intact so we can potentially optimize the I/O * later on in the buf_daemon or update daemon * flush. */ cursize = vp->v_lastw - vp->v_cstart + 1; if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp, gbflags); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster * if *really* writing sequentially * in the logical file (seqcount > 1), * otherwise delay it in the hopes that * the low level disk driver can * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. */ if ((vp->v_type == VREG) && ((u_quad_t) bp->b_offset + lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out if seqcount tells us we * are operating sequentially, otherwise let the buf or * update daemon handle it. */ bdwrite(bp); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1, gbflags); } vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { /* * We are low on memory, get it going NOW */ bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); } vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { struct buf *bp, *tbp; struct bufobj *bo; int i, j; int totalwritten = 0; int dbsize = btodb(size); if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; bo = &vp->v_bufobj; while (len > 0) { /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. */ BO_LOCK(bo); if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); ++start_lbn; --len; continue; } if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { ++start_lbn; --len; continue; } if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { BUF_UNLOCK(tbp); ++start_lbn; --len; continue; } if (tbp->b_pin_count > 0) { BUF_UNLOCK(tbp); ++start_lbn; --len; continue; } bremfree(tbp); tbp->b_flags &= ~B_DONE; /* * Extra memory in the buffer, punt on this buffer. * XXX we could handle this in most cases, but we would * have to push the extra memory down to after our max * possible cluster size and then potentially pull it back * up if the cluster was terminated prematurely--too much * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != (B_CLUSTEROK | B_VMIO)) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || ((bp = (vp->v_vflag & VV_MD) != 0 ? trypbuf(&cluster_pbuf_freecnt) : getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } /* * We got a pbuf to make the cluster in. * so initialise it. */ TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_wcred != NOCRED) bp->b_wcred = crhold(tbp->b_wcred); bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; bp->b_offset = tbp->b_offset; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ if ((gbflags & GB_UNMAPPED) == 0 || (tbp->b_flags & B_VMIO) == 0) { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } else { bp->b_data = unmapped_buf; } bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to * be written as well. */ for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* If not the first buffer */ /* * If the adjacent data is not even in core it * can't need to be written. */ BO_LOCK(bo); if ((tbp = gbincore(bo, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); break; } /* * If it IS in core, but has different * characteristics, or is locked (which * means it could be undergoing a background * I/O or be in a weird state), then don't * cluster with it. */ if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) break; if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_CLUSTEROK | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || tbp->b_wcred != bp->b_wcred) { BUF_UNLOCK(tbp); break; } /* * Check that the combined cluster * would make sense with regard to pages * and would not be too large */ if ((tbp->b_bcount != size) || ((bp->b_blkno + (dbsize * i)) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { BUF_UNLOCK(tbp); break; } /* * Do not pull in pinned buffers. */ if (tbp->b_pin_count > 0) { BUF_UNLOCK(tbp); break; } /* * Ok, it's passed all the tests, * so remove it from the free list * and mark it busy. We will use it. */ bremfree(tbp); tbp->b_flags &= ~B_DONE; } /* end of code for non-first buffers only */ /* * If the IO is via the VM then we do some * special VM hackery (yuck). Since the buffer's * block size may not be page-aligned it is possible * for a page to be shared between two buffers. We * have to get rid of the duplication when building * the cluster. */ if (tbp->b_flags & B_VMIO) { vm_page_t m; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); if (i == 0) { vfs_drain_busy_pages(tbp); } else { /* if not first buffer */ for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if (vm_page_xbusied(m)) { VM_OBJECT_WUNLOCK( tbp->b_object); bqrelse(tbp); goto finishcluster; } } } for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; vm_page_sbusy(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages - 1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } bp->b_bcount += size; bp->b_bufsize += size; /* * If any of the clustered buffers have their * B_BARRIER flag set, transfer that request to * the cluster. */ bp->b_flags |= (tbp->b_flags & B_BARRIER); tbp->b_flags &= ~(B_DONE | B_BARRIER); tbp->b_flags |= B_ASYNC; tbp->b_ioflags &= ~BIO_ERROR; tbp->b_iocmd = BIO_WRITE; bundirty(tbp); reassignbuf(tbp); /* put on clean list */ bufobj_wref(tbp->b_bufobj); BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } finishcluster: if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); totalwritten += bp->b_bufsize; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. */ static struct cluster_save * cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, gbflags, &bp); buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buflist->bs_children[i] = bp = last_bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); buflist->bs_nchildren = i + 1; return (buflist); } Index: head/sys/kern/vfs_export.c =================================================================== --- head/sys/kern/vfs_export.c (revision 298068) +++ head/sys/kern/vfs_export.c (revision 298069) @@ -1,520 +1,520 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure"); static struct radix_node_head *vfs_create_addrlist_af( struct radix_node_head **prnh, int off); static void vfs_free_addrlist(struct netexport *nep); static int vfs_free_netcred(struct radix_node *rn, void *w); static void vfs_free_addrlist_af(struct radix_node_head **prnh); static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, struct export_args *argp); static struct netcred *vfs_export_lookup(struct mount *, struct sockaddr *); /* * Network address lookup element */ struct netcred { struct radix_node netc_rnodes[2]; int netc_exflags; struct ucred *netc_anon; int netc_numsecflavors; int netc_secflavors[MAXSECFLAVORS]; }; /* * Network export information */ struct netexport { struct netcred ne_defexported; /* Default export */ struct radix_node_head *ne4; struct radix_node_head *ne6; }; /* * Build hash lists of net addresses and hang them off the mount point. * Called by vfs_export() to set up the lists of export addresses. */ static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, struct export_args *argp) { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; - struct sockaddr *saddr, *smask = 0; + struct sockaddr *saddr, *smask = NULL; #if defined(INET6) || defined(INET) int off; #endif int error; /* * XXX: This routine converts from a `struct xucred' * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This * operation is questionable; for example, what should be done * with fields like cr_uidinfo and cr_prison? Currently, this * routine does not touch them (leaves them as NULL). */ if (argp->ex_anon.cr_version != XUCRED_VERSION) { vfs_mount_error(mp, "ex_anon.cr_version: %d != %d", argp->ex_anon.cr_version, XUCRED_VERSION); return (EINVAL); } if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) { vfs_mount_error(mp, "MNT_DEFEXPORTED already set for mount %p", mp); return (EPERM); } np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; np->netc_anon = crget(); np->netc_anon->cr_uid = argp->ex_anon.cr_uid; crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups, argp->ex_anon.cr_groups); np->netc_anon->cr_prison = &prison0; prison_hold(np->netc_anon->cr_prison); np->netc_numsecflavors = argp->ex_numsecflavors; bcopy(argp->ex_secflavors, np->netc_secflavors, sizeof(np->netc_secflavors)); MNT_ILOCK(mp); mp->mnt_flag |= MNT_DEFEXPORTED; MNT_IUNLOCK(mp); return (0); } #if MSIZE <= 256 if (argp->ex_addrlen > MLEN) { vfs_mount_error(mp, "ex_addrlen %d is greater than %d", argp->ex_addrlen, MLEN); return (EINVAL); } #endif i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen))) goto out; if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) { error = EINVAL; vfs_mount_error(mp, "Invalid saddr->sa_family: %d"); goto out; } if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); error = copyin(argp->ex_mask, smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } rnh = NULL; switch (saddr->sa_family) { #ifdef INET case AF_INET: if ((rnh = nep->ne4) == NULL) { off = offsetof(struct sockaddr_in, sin_addr) << 3; rnh = vfs_create_addrlist_af(&nep->ne4, off); } break; #endif #ifdef INET6 case AF_INET6: if ((rnh = nep->ne6) == NULL) { off = offsetof(struct sockaddr_in6, sin6_addr) << 3; rnh = vfs_create_addrlist_af(&nep->ne6, off); } break; #endif } if (rnh == NULL) { error = ENOBUFS; vfs_mount_error(mp, "%s %s %d", "Unable to initialize radix node head ", "for address family", saddr->sa_family); goto out; } RADIX_NODE_HEAD_LOCK(rnh); rn = (*rnh->rnh_addaddr)(saddr, smask, &rnh->rh, np->netc_rnodes); RADIX_NODE_HEAD_UNLOCK(rnh); if (rn == NULL || np != (struct netcred *)rn) { /* already exists */ error = EPERM; vfs_mount_error(mp, "netcred already exists for given addr/mask"); goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = crget(); np->netc_anon->cr_uid = argp->ex_anon.cr_uid; crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups, argp->ex_anon.cr_groups); np->netc_anon->cr_prison = &prison0; prison_hold(np->netc_anon->cr_prison); np->netc_numsecflavors = argp->ex_numsecflavors; bcopy(argp->ex_secflavors, np->netc_secflavors, sizeof(np->netc_secflavors)); return (0); out: free(np, M_NETADDR); return (error); } /* Helper for vfs_free_addrlist. */ /* ARGSUSED */ static int vfs_free_netcred(struct radix_node *rn, void *w) { struct radix_node_head *rnh = (struct radix_node_head *) w; struct ucred *cred; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, &rnh->rh); cred = ((struct netcred *)rn)->netc_anon; if (cred != NULL) crfree(cred); free(rn, M_NETADDR); return (0); } static struct radix_node_head * vfs_create_addrlist_af(struct radix_node_head **prnh, int off) { if (rn_inithead((void **)prnh, off) == 0) return (NULL); RADIX_NODE_HEAD_LOCK_INIT(*prnh); return (*prnh); } static void vfs_free_addrlist_af(struct radix_node_head **prnh) { struct radix_node_head *rnh; rnh = *prnh; RADIX_NODE_HEAD_LOCK(rnh); (*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, rnh); RADIX_NODE_HEAD_UNLOCK(rnh); RADIX_NODE_HEAD_DESTROY(rnh); rn_detachhead((void **)prnh); prnh = NULL; } /* * Free the net address hash lists that are hanging off the mount points. */ static void vfs_free_addrlist(struct netexport *nep) { struct ucred *cred; if (nep->ne4 != NULL) vfs_free_addrlist_af(&nep->ne4); if (nep->ne6 != NULL) vfs_free_addrlist_af(&nep->ne6); cred = nep->ne_defexported.netc_anon; if (cred != NULL) crfree(cred); } /* * High level function to manipulate export options on a mount point * and the passed in netexport. * Struct export_args *argp is the variable used to twiddle options, * the structure is described in sys/mount.h */ int vfs_export(struct mount *mp, struct export_args *argp) { struct netexport *nep; int error; if (argp->ex_numsecflavors < 0 || argp->ex_numsecflavors >= MAXSECFLAVORS) return (EINVAL); error = 0; lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL); nep = mp->mnt_export; if (argp->ex_flags & MNT_DELEXPORT) { if (nep == NULL) { error = ENOENT; goto out; } if (mp->mnt_flag & MNT_EXPUBLIC) { vfs_setpublicfs(NULL, NULL, NULL); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_EXPUBLIC; MNT_IUNLOCK(mp); } vfs_free_addrlist(nep); mp->mnt_export = NULL; free(nep, M_MOUNT); nep = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); MNT_IUNLOCK(mp); } if (argp->ex_flags & MNT_EXPORTED) { if (nep == NULL) { nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO); mp->mnt_export = nep; } if (argp->ex_flags & MNT_EXPUBLIC) { if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) goto out; MNT_ILOCK(mp); mp->mnt_flag |= MNT_EXPUBLIC; MNT_IUNLOCK(mp); } if ((error = vfs_hang_addrlist(mp, nep, argp))) goto out; MNT_ILOCK(mp); mp->mnt_flag |= MNT_EXPORTED; MNT_IUNLOCK(mp); } out: lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); /* * Once we have executed the vfs_export() command, we do * not want to keep the "export" option around in the * options list, since that will cause subsequent MNT_UPDATE * calls to fail. The export information is saved in * mp->mnt_export, so we can safely delete the "export" mount option * here. */ vfs_deleteopt(mp->mnt_optnew, "export"); vfs_deleteopt(mp->mnt_opt, "export"); return (error); } /* * Set the publicly exported filesystem (WebNFS). Currently, only * one public filesystem is possible in the spec (RFC 2054 and 2055) */ int vfs_setpublicfs(struct mount *mp, struct netexport *nep, struct export_args *argp) { int error; struct vnode *rvp; char *cp; /* * mp == NULL -> invalidate the current info, the FS is * no longer exported. May be called from either vfs_export * or unmount, so check if it hasn't already been done. */ if (mp == NULL) { if (nfs_pub.np_valid) { nfs_pub.np_valid = 0; if (nfs_pub.np_index != NULL) { free(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; } } return (0); } /* * Only one allowed at a time. */ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) return (EBUSY); /* * Get real filehandle for root of exported FS. */ bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp))) return (error); if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) return (error); vput(rvp); /* * If an indexfile was specified, pull it in. */ if (argp->ex_indexfile != NULL) { if (nfs_pub.np_index != NULL) nfs_pub.np_index = malloc(MAXNAMLEN + 1, M_TEMP, M_WAITOK); error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, MAXNAMLEN, (size_t *)0); if (!error) { /* * Check for illegal filenames. */ for (cp = nfs_pub.np_index; *cp; cp++) { if (*cp == '/') { error = EINVAL; break; } } } if (error) { free(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL; return (error); } } nfs_pub.np_mount = mp; nfs_pub.np_valid = 1; return (0); } /* * Used by the filesystems to determine if a given network address * (passed in 'nam') is present in their exports list, returns a pointer * to struct netcred so that the filesystem can examine it for * access rights (read/write/etc). */ static struct netcred * vfs_export_lookup(struct mount *mp, struct sockaddr *nam) { struct netexport *nep; register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; nep = mp->mnt_export; if (nep == NULL) return (NULL); np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = nam; rnh = NULL; switch (saddr->sa_family) { case AF_INET: rnh = nep->ne4; break; case AF_INET6: rnh = nep->ne6; break; } if (rnh != NULL) { RADIX_NODE_HEAD_RLOCK(rnh); np = (struct netcred *) (*rnh->rnh_matchaddr)(saddr, &rnh->rh); RADIX_NODE_HEAD_RUNLOCK(rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * XXX: This comment comes from the deprecated ufs_check_export() * XXX: and may not entirely apply, but lacking something better: * This is the generic part of fhtovp called after the underlying * filesystem has validated the file handle. * * Verify that a host should have access to a filesystem. */ int vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) { struct netcred *np; lockmgr(&mp->mnt_explock, LK_SHARED, NULL); np = vfs_export_lookup(mp, nam); if (np == NULL) { lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); *credanonp = NULL; return (EACCES); } *extflagsp = np->netc_exflags; if ((*credanonp = np->netc_anon) != NULL) crhold(*credanonp); if (numsecflavors) *numsecflavors = np->netc_numsecflavors; if (secflavors) *secflavors = np->netc_secflavors; lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); return (0); } Index: head/sys/kern/vfs_lookup.c =================================================================== --- head/sys/kern/vfs_lookup.c (revision 298068) +++ head/sys/kern/vfs_lookup.c (revision 298069) @@ -1,1266 +1,1266 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #define NAMEI_DIAGNOSTIC 1 #undef NAMEI_DIAGNOSTIC SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *", "unsigned long"); SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *"); /* * Allocation zone for namei */ uma_zone_t namei_zone; /* * Placeholder vnode for mp traversal */ static struct vnode *vp_crossmp; static void nameiinit(void *dummy __unused) { namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp); vn_lock(vp_crossmp, LK_EXCLUSIVE); VN_LOCK_ASHARE(vp_crossmp); VOP_UNLOCK(vp_crossmp, 0); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); static int lookup_shared = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RWTUN, &lookup_shared, 0, "Enables/Disables shared locks for path name translation"); static void namei_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif } static int namei_handle_root(struct nameidata *ndp, struct vnode **dpp) { struct componentname *cnp; cnp = &ndp->ni_cnd; if (ndp->ni_strictrelative != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif return (ENOTCAPABLE); } while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } *dpp = ndp->ni_rootdir; VREF(*dpp); return (0); } /* * Convert a pathname into a pointer to a locked vnode. * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ int namei(struct nameidata *ndp) { struct filedesc *fdp; /* pointer to file descriptor state */ char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ struct iovec aiov; /* uio for reading symbolic links */ struct uio auio; int error, linklen, startdir_used; struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, ("namei: nameiop contaminated with flags")); KASSERT((cnp->cn_flags & OPMASK) == 0, ("namei: flags contaminated with nameiops")); MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || ndp->ni_startdir->v_type == VBAD); if (!lookup_shared) cnp->cn_flags &= ~LOCKSHARED; fdp = p->p_fd; /* We will set this ourselves if we need it. */ cnp->cn_flags &= ~TRAILINGSLASH; /* * Get a buffer for the name to be translated, and copy the * name into the buffer. */ if ((cnp->cn_flags & HASBUF) == 0) cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); /* * Don't allow empty pathnames. */ if (error == 0 && *cnp->cn_pnbuf == '\0') error = ENOENT; #ifdef CAPABILITY_MODE /* * In capability mode, lookups must be "strictly relative" (i.e. * not an absolute path, and not containing '..' components) to * a real file descriptor, not the pseudo-descriptor AT_FDCWD. */ if (error == 0 && IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) { ndp->ni_strictrelative = 1; if (ndp->ni_dirfd == AT_FDCWD) { #ifdef KTRACE if (KTRPOINT(td, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif error = ECAPMODE; } } #endif if (error != 0) { namei_cleanup_cnp(cnp); ndp->ni_vp = NULL; return (error); } ndp->ni_loopcnt = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_NAMEI)) { KASSERT(cnp->cn_thread == curthread, ("namei not using curthread")); ktrnamei(cnp->cn_pnbuf); } #endif /* * Get starting point for the translation. */ FILEDESC_SLOCK(fdp); ndp->ni_rootdir = fdp->fd_rdir; VREF(ndp->ni_rootdir); ndp->ni_topdir = fdp->fd_jdir; /* * If we are auditing the kernel pathname, save the user pathname. */ if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf); startdir_used = 0; dp = NULL; cnp->cn_nameptr = cnp->cn_pnbuf; if (cnp->cn_pnbuf[0] == '/') { error = namei_handle_root(ndp, &dp); } else { if (ndp->ni_startdir != NULL) { dp = ndp->ni_startdir; startdir_used = 1; } else if (ndp->ni_dirfd == AT_FDCWD) { dp = fdp->fd_cdir; VREF(dp); } else { cap_rights_t rights; rights = ndp->ni_rightsneeded; cap_rights_set(&rights, CAP_LOOKUP); if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_ATFD1(ndp->ni_dirfd); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_ATFD2(ndp->ni_dirfd); error = fgetvp_rights(td, ndp->ni_dirfd, &rights, &ndp->ni_filecaps, &dp); if (error == EINVAL) error = ENOTDIR; #ifdef CAPABILITIES /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { ndp->ni_strictrelative = 1; } #endif } if (error == 0 && dp->v_type != VDIR) error = ENOTDIR; } FILEDESC_SUNLOCK(fdp); if (ndp->ni_startdir != NULL && !startdir_used) vrele(ndp->ni_startdir); if (error != 0) { if (dp != NULL) vrele(dp); vrele(ndp->ni_rootdir); namei_cleanup_cnp(cnp); return (error); } SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf, cnp->cn_flags); for (;;) { ndp->ni_startdir = dp; error = lookup(ndp); if (error != 0) { vrele(ndp->ni_rootdir); namei_cleanup_cnp(cnp); SDT_PROBE2(vfs, namei, lookup, return, error, NULL); return (error); } /* * If not a symbolic link, we're done. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { vrele(ndp->ni_rootdir); if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { namei_cleanup_cnp(cnp); } else cnp->cn_flags |= HASBUF; SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp); return (0); } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; break; } #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp); if (error != 0) break; } #endif if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error != 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENOENT; break; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENAMETOOLONG; break; } if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; vput(ndp->ni_vp); dp = ndp->ni_dvp; /* * Check if root directory should replace current directory. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { vrele(dp); error = namei_handle_root(ndp, &dp); if (error != 0) { vrele(ndp->ni_rootdir); namei_cleanup_cnp(cnp); return (error); } } } vrele(ndp->ni_rootdir); namei_cleanup_cnp(cnp); vput(ndp->ni_vp); ndp->ni_vp = NULL; vrele(ndp->ni_dvp); SDT_PROBE2(vfs, namei, lookup, return, error, NULL); return (error); } static int compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags) { if (mp == NULL || ((lkflags & LK_SHARED) && (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) || ((cnflags & ISDOTDOT) && (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) { lkflags &= ~LK_SHARED; lkflags |= LK_EXCLUSIVE; } lkflags |= LK_NODDLKTREAT; return (lkflags); } static __inline int needs_exclusive_leaf(struct mount *mp, int flags) { /* * Intermediate nodes can use shared locks, we only need to * force an exclusive lock for leaf nodes. */ if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF)) return (0); /* Always use exclusive locks if LOCKSHARED isn't set. */ if (!(flags & LOCKSHARED)) return (1); /* * For lookups during open(), if the mount point supports * extended shared operations, then use a shared lock for the * leaf node, otherwise use an exclusive lock. */ if ((flags & ISOPEN) != 0) return (!MNT_EXTENDED_SHARED(mp)); /* * Lookup requests outside of open() that specify LOCKSHARED * only need a shared lock on the leaf vnode. */ return (0); } /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is taken from ni_startdir. The pathname is * descended until done, or a symbolic link is encountered. The variable * ni_more is clear if the path is completed; it is set to one if a * symbolic link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. If flag has WANTPARENT or'ed into it, the parent directory is * returned unlocked. Otherwise the parent directory is not returned. If * the target of the pathname exists and LOCKLEAF is or'ed into the flag * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". * * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp * if WANTPARENT set, return unlocked parent in ni_dvp */ int lookup(struct nameidata *ndp) { char *cp; /* pointer into pathname argument */ - struct vnode *dp = 0; /* the directory we are searching */ + struct vnode *dp = NULL; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ struct prison *pr; int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; int dpunlocked = 0; /* dp has already been unlocked */ int relookup = 0; /* do not consume the path component */ struct componentname *cnp = &ndp->ni_cnd; int lkflags_save; int ni_dvp_unlocked; /* * Setup: break out flag bits into variables. */ ni_dvp_unlocked = 0; wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); KASSERT(cnp->cn_nameiop == LOOKUP || wantparent, ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT.")); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE && cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; ndp->ni_dvp = NULL; /* * We use shared locks until we hit the parent of the last cn then * we adjust based on the requesting flags. */ if (lookup_shared) cnp->cn_lkflags = LK_SHARED; else cnp->cn_lkflags = LK_EXCLUSIVE; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); dirloop: /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (cnp->cn_namelen > NAME_MAX) { error = ENAMETOOLONG; goto bad; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *cp = c; } #endif ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { *ndp->ni_next = '\0'; cnp->cn_flags |= TRAILINGSLASH; } } ndp->ni_next = cp; cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; if (*ndp->ni_next == 0) cnp->cn_flags |= ISLASTCN; else cnp->cn_flags &= ~ISLASTCN; if ((cnp->cn_flags & ISLASTCN) != 0 && cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (cnp->cn_nameiop != LOOKUP) { error = EISDIR; goto bad; } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); } ndp->ni_vp = dp; if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) VOP_UNLOCK(dp, 0); /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); goto success; } /* * Handle "..": five special cases. * 0. If doing a capability lookup, return ENOTCAPABLE (this is a * fairly conservative design choice, but it's the only one that we * are satisfied guarantees the property we're looking for). * 1. Return an error if this is the last component of * the name and the operation is DELETE or RENAME. * 2. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 3. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other filesystem. * 4. If the vnode is the top directory of * the jail or chroot, don't let them out. */ if (cnp->cn_flags & ISDOTDOT) { if (ndp->ni_strictrelative != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif error = ENOTCAPABLE; goto bad; } if ((cnp->cn_flags & ISLASTCN) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } for (;;) { for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dp == pr->pr_root) break; if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode || pr != NULL || ((dp->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT) != 0)) { ndp->ni_dvp = dp; ndp->ni_vp = dp; VREF(dp); goto nextname; } if ((dp->v_vflag & VV_ROOT) == 0) break; if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ error = ENOENT; goto bad; } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, ISDOTDOT)); } } /* * We now have a segment name to search for, and a directory to search. */ unionlookup: #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp, cnp); if (error) goto bad; } #endif ndp->ni_dvp = dp; ndp->ni_vp = NULL; ASSERT_VOP_LOCKED(dp, "lookup"); /* * If we have a shared lock we may need to upgrade the lock for the * last operation. */ if (dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED && (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT)) vn_lock(dp, LK_UPGRADE|LK_RETRY); if ((dp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; goto bad; } /* * If we're looking up the last component and we need an exclusive * lock, adjust our lkflags. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags)) cnp->cn_lkflags = LK_EXCLUSIVE; #ifdef NAMEI_DIAGNOSTIC vprint("lookup in", dp); #endif lkflags_save = cnp->cn_lkflags; cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags, cnp->cn_flags); error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp); cnp->cn_lkflags = lkflags_save; if (error != 0) { KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif if ((error == ENOENT) && (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); goto unionlookup; } if (error == ERELOOKUP) { vref(dp); ndp->ni_vp = dp; error = 0; relookup = 1; goto good; } if (error != EJUSTRETURN) goto bad; /* * At this point, we know we're at the end of the * pathname. If creating / renaming, we can consider * allowing the file or directory to be created / renamed, * provided we're not on a read-only filesystem. */ if (rdonly) { error = EROFS; goto bad; } /* trailing slash only allowed for directories */ if ((cnp->cn_flags & TRAILINGSLASH) && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp, 0); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } goto success; } good: #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif dp = ndp->ni_vp; /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted filesystem. */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { if (vfs_busy(mp, 0)) continue; vput(dp); if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vref(vp_crossmp); ndp->ni_dvp = vp_crossmp; error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, cnp->cn_flags), &tdp); vfs_unbusy(mp); if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) panic("vp_crossmp exclusively locked or reclaimed"); if (error) { dpunlocked = 1; goto bad2; } ndp->ni_vp = dp = tdp; } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; if (dp->v_iflag & VI_DOOMED) { /* * We can't know whether the directory was mounted with * NOSYMFOLLOW, so we can't follow safely. */ error = ENOENT; goto bad2; } if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { error = EACCES; goto bad2; } /* * Symlink code always expects an unlocked dvp. */ if (ndp->ni_dvp != ndp->ni_vp) { VOP_UNLOCK(ndp->ni_dvp, 0); ni_dvp_unlocked = 1; } goto success; } nextname: /* * Not a symbolic link that we will follow. Continue with the * next component if there is any; otherwise, we're done. */ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/', ("lookup: invalid path state.")); if (relookup) { relookup = 0; if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); goto dirloop; } if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); goto dirloop; } /* * If we're processing a path with a trailing slash, * check that the end result is a directory. */ if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) { error = ENOTDIR; goto bad2; } /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) { ni_dvp_unlocked = 2; if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) { VOP_UNLOCK(ndp->ni_dvp, 0); ni_dvp_unlocked = 1; } if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0); success: /* * Because of lookup_shared we may have the vnode shared locked, but * the caller may want it to be exclusively locked. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { vn_lock(dp, LK_UPGRADE | LK_RETRY); if (dp->v_iflag & VI_DOOMED) { error = ENOENT; goto bad2; } } return (0); bad2: if (ni_dvp_unlocked != 2) { if (dp != ndp->ni_dvp && !ni_dvp_unlocked) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } bad: if (!dpunlocked) vput(dp); ndp->ni_vp = NULL; return (error); } /* * relookup - lookup a path name component * Used by lookup to re-acquire things. */ int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { - struct vnode *dp = 0; /* the directory we are searching */ + struct vnode *dp = NULL; /* the directory we are searching */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; KASSERT(cnp->cn_flags & ISLASTCN, ("relookup: Not given last component.")); /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); KASSERT(wantparent, ("relookup: parent not wanted.")); rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; dp = dvp; cnp->cn_lkflags = LK_EXCLUSIVE; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef NAMEI_DIAGNOSTIC printf("{%s}: ", cnp->cn_nameptr); #endif /* * Check for "" which represents the root directory after slash * removal. */ if (cnp->cn_nameptr[0] == '\0') { /* * Support only LOOKUP for "/" because lookup() * can't succeed for CREATE, DELETE and RENAME. */ KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP")); KASSERT(dp->v_type == VDIR, ("dp is not a directory")); if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp, 0); *vpp = dp; /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } if (cnp->cn_flags & ISDOTDOT) panic ("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ #ifdef NAMEI_DIAGNOSTIC vprint("search in:", dp); #endif if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { KASSERT(*vpp == NULL, ("leaf should be empty")); if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp, 0); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ return (0); } dp = *vpp; /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (dvp == dp) vrele(dvp); else vput(dvp); error = EROFS; goto bad; } /* * Set the parent lock/ref state to the requested state. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) { if (wantparent) VOP_UNLOCK(dvp, 0); else vput(dvp); } else if (!wantparent) vrele(dvp); /* * Check for symbolic link */ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), ("relookup: symlink found.\n")); /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0); return (0); bad: vput(dp); *vpp = NULL; return (error); } void NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg, const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp, struct thread *td) { ndp->ni_cnd.cn_nameiop = op; ndp->ni_cnd.cn_flags = flags; ndp->ni_segflg = segflg; ndp->ni_dirp = namep; ndp->ni_dirfd = dirfd; ndp->ni_startdir = startdir; ndp->ni_strictrelative = 0; if (rightsp != NULL) ndp->ni_rightsneeded = *rightsp; else cap_rights_init(&ndp->ni_rightsneeded); filecaps_init(&ndp->ni_filecaps); ndp->ni_cnd.cn_thread = td; } /* * Free data allocated by namei(); see namei(9) for details. */ void NDFREE(struct nameidata *ndp, const u_int flags) { int unlock_dvp; int unlock_vp; unlock_dvp = 0; unlock_vp = 0; if (!(flags & NDF_NO_FREE_PNBUF) && (ndp->ni_cnd.cn_flags & HASBUF)) { uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) unlock_vp = 1; if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { if (unlock_vp) { vput(ndp->ni_vp); unlock_vp = 0; } else vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (unlock_vp) VOP_UNLOCK(ndp->ni_vp, 0); if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) unlock_dvp = 1; if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { if (unlock_dvp) { vput(ndp->ni_dvp); unlock_dvp = 0; } else vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (unlock_dvp) VOP_UNLOCK(ndp->ni_dvp, 0); if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } /* * Determine if there is a suitable alternate filename under the specified * prefix for the specified path. If the create flag is set, then the * alternate prefix will be used so long as the parent directory exists. * This is used by the various compatiblity ABIs so that Linux binaries prefer * files under /compat/linux for example. The chosen path (whether under * the prefix or under /) is returned in a kernel malloc'd buffer pointed * to by pathbuf. The caller is responsible for free'ing the buffer from * the M_TEMP bucket if one is returned. */ int kern_alternate_path(struct thread *td, const char *prefix, const char *path, enum uio_seg pathseg, char **pathbuf, int create, int dirfd) { struct nameidata nd, ndroot; char *ptr, *buf, *cp; size_t len, sz; int error; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pathbuf = buf; /* Copy the prefix into the new pathname as a starting point. */ len = strlcpy(buf, prefix, MAXPATHLEN); if (len >= MAXPATHLEN) { *pathbuf = NULL; free(buf, M_TEMP); return (EINVAL); } sz = MAXPATHLEN - len; ptr = buf + len; /* Append the filename to the prefix. */ if (pathseg == UIO_SYSSPACE) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { *pathbuf = NULL; free(buf, M_TEMP); return (error); } /* Only use a prefix with absolute pathnames. */ if (*ptr != '/') { error = EINVAL; goto keeporig; } if (dirfd != AT_FDCWD) { /* * We want the original because the "prefix" is * included in the already opened dirfd. */ bcopy(ptr, buf, len); return (0); } /* * We know that there is a / somewhere in this pathname. * Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (create) { for (cp = &ptr[len] - 1; *cp != '/'; cp--); *cp = '\0'; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); error = namei(&nd); *cp = '/'; if (error != 0) goto keeporig; } else { NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error != 0) goto keeporig; /* * We now compare the vnode of the prefix to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, td); /* We shouldn't ever get an error from this namei(). */ error = namei(&ndroot); if (error == 0) { if (nd.ni_vp == ndroot.ni_vp) error = ENOENT; NDFREE(&ndroot, NDF_ONLY_PNBUF); vrele(ndroot.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); keeporig: /* If there was an error, use the original path name. */ if (error) bcopy(ptr, buf, len); return (error); } Index: head/sys/libkern/strtol.c =================================================================== --- head/sys/libkern/strtol.c (revision 298068) +++ head/sys/libkern/strtol.c (revision 298069) @@ -1,129 +1,129 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Chris Torek. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)strtol.c 8.1 (Berkeley) 6/4/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include /* * Convert a string to a long integer. * * Ignores `locale' stuff. Assumes that the upper and lower case * alphabets and digits are each contiguous. */ long strtol(nptr, endptr, base) const char *nptr; char **endptr; int base; { const char *s = nptr; unsigned long acc; unsigned char c; unsigned long cutoff; int neg = 0, any, cutlim; /* * Skip white space and pick up leading +/- sign if any. * If base is 0, allow 0x for hex and 0 for octal, else * assume decimal; if base is already 16, allow 0x. */ do { c = *s++; } while (isspace(c)); if (c == '-') { neg = 1; c = *s++; } else if (c == '+') c = *s++; if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) { c = s[1]; s += 2; base = 16; } if (base == 0) base = c == '0' ? 8 : 10; /* * Compute the cutoff value between legal numbers and illegal * numbers. That is the largest legal value, divided by the * base. An input number that is greater than this value, if * followed by a legal input character, is too big. One that * is equal to this value may be valid or not; the limit * between valid and invalid numbers is then based on the last * digit. For instance, if the range for longs is * [-2147483648..2147483647] and the input base is 10, * cutoff will be set to 214748364 and cutlim to either * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated * a value > 214748364, or equal but the next digit is > 7 (or 8), * the number is too big, and we will return a range error. * * Set any if any `digits' consumed; make it negative to indicate * overflow. */ cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX; cutlim = cutoff % (unsigned long)base; cutoff /= (unsigned long)base; for (acc = 0, any = 0;; c = *s++) { if (!isascii(c)) break; if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else break; if (c >= base) break; if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) any = -1; else { any = 1; acc *= base; acc += c; } } if (any < 0) { acc = neg ? LONG_MIN : LONG_MAX; } else if (neg) acc = -acc; - if (endptr != 0) + if (endptr != NULL) *endptr = __DECONST(char *, any ? s - 1 : nptr); return (acc); } Index: head/sys/libkern/strtoq.c =================================================================== --- head/sys/libkern/strtoq.c (revision 298068) +++ head/sys/libkern/strtoq.c (revision 298069) @@ -1,130 +1,130 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Chris Torek. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include /* * Convert a string to a quad integer. * * Ignores `locale' stuff. Assumes that the upper and lower case * alphabets and digits are each contiguous. */ quad_t strtoq(const char *nptr, char **endptr, int base) { const char *s; u_quad_t acc; unsigned char c; u_quad_t qbase, cutoff; int neg, any, cutlim; /* * Skip white space and pick up leading +/- sign if any. * If base is 0, allow 0x for hex and 0 for octal, else * assume decimal; if base is already 16, allow 0x. */ s = nptr; do { c = *s++; } while (isspace(c)); if (c == '-') { neg = 1; c = *s++; } else { neg = 0; if (c == '+') c = *s++; } if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) { c = s[1]; s += 2; base = 16; } if (base == 0) base = c == '0' ? 8 : 10; /* * Compute the cutoff value between legal numbers and illegal * numbers. That is the largest legal value, divided by the * base. An input number that is greater than this value, if * followed by a legal input character, is too big. One that * is equal to this value may be valid or not; the limit * between valid and invalid numbers is then based on the last * digit. For instance, if the range for quads is * [-9223372036854775808..9223372036854775807] and the input base * is 10, cutoff will be set to 922337203685477580 and cutlim to * either 7 (neg==0) or 8 (neg==1), meaning that if we have * accumulated a value > 922337203685477580, or equal but the * next digit is > 7 (or 8), the number is too big, and we will * return a range error. * * Set any if any `digits' consumed; make it negative to indicate * overflow. */ qbase = (unsigned)base; cutoff = neg ? (u_quad_t)-(QUAD_MIN + QUAD_MAX) + QUAD_MAX : QUAD_MAX; cutlim = cutoff % qbase; cutoff /= qbase; for (acc = 0, any = 0;; c = *s++) { if (!isascii(c)) break; if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else break; if (c >= base) break; if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) any = -1; else { any = 1; acc *= qbase; acc += c; } } if (any < 0) { acc = neg ? QUAD_MIN : QUAD_MAX; } else if (neg) acc = -acc; - if (endptr != 0) + if (endptr != NULL) *endptr = __DECONST(char *, any ? s - 1 : nptr); return (acc); } Index: head/sys/libkern/strtoul.c =================================================================== --- head/sys/libkern/strtoul.c (revision 298068) +++ head/sys/libkern/strtoul.c (revision 298069) @@ -1,108 +1,108 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Chris Torek. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)strtoul.c 8.1 (Berkeley) 6/4/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include /* * Convert a string to an unsigned long integer. * * Ignores `locale' stuff. Assumes that the upper and lower case * alphabets and digits are each contiguous. */ unsigned long strtoul(nptr, endptr, base) const char *nptr; char **endptr; int base; { const char *s = nptr; unsigned long acc; unsigned char c; unsigned long cutoff; int neg = 0, any, cutlim; /* * See strtol for comments as to the logic used. */ do { c = *s++; } while (isspace(c)); if (c == '-') { neg = 1; c = *s++; } else if (c == '+') c = *s++; if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) { c = s[1]; s += 2; base = 16; } if (base == 0) base = c == '0' ? 8 : 10; cutoff = (unsigned long)ULONG_MAX / (unsigned long)base; cutlim = (unsigned long)ULONG_MAX % (unsigned long)base; for (acc = 0, any = 0;; c = *s++) { if (!isascii(c)) break; if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else break; if (c >= base) break; if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) any = -1; else { any = 1; acc *= base; acc += c; } } if (any < 0) { acc = ULONG_MAX; } else if (neg) acc = -acc; - if (endptr != 0) + if (endptr != NULL) *endptr = __DECONST(char *, any ? s - 1 : nptr); return (acc); } Index: head/sys/libkern/strtouq.c =================================================================== --- head/sys/libkern/strtouq.c (revision 298068) +++ head/sys/libkern/strtouq.c (revision 298069) @@ -1,107 +1,107 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Chris Torek. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include /* * Convert a string to an unsigned quad integer. * * Ignores `locale' stuff. Assumes that the upper and lower case * alphabets and digits are each contiguous. */ u_quad_t strtouq(const char *nptr, char **endptr, int base) { const char *s = nptr; u_quad_t acc; unsigned char c; u_quad_t qbase, cutoff; int neg, any, cutlim; /* * See strtoq for comments as to the logic used. */ do { c = *s++; } while (isspace(c)); if (c == '-') { neg = 1; c = *s++; } else { neg = 0; if (c == '+') c = *s++; } if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) { c = s[1]; s += 2; base = 16; } if (base == 0) base = c == '0' ? 8 : 10; qbase = (unsigned)base; cutoff = (u_quad_t)UQUAD_MAX / qbase; cutlim = (u_quad_t)UQUAD_MAX % qbase; for (acc = 0, any = 0;; c = *s++) { if (!isascii(c)) break; if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else break; if (c >= base) break; if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) any = -1; else { any = 1; acc *= qbase; acc += c; } } if (any < 0) { acc = UQUAD_MAX; } else if (neg) acc = -acc; - if (endptr != 0) + if (endptr != NULL) *endptr = __DECONST(char *, any ? s - 1 : nptr); return (acc); }