Index: stable/4/sys/alpha/alpha/machdep.c =================================================================== --- stable/4/sys/alpha/alpha/machdep.c (revision 118739) +++ stable/4/sys/alpha/alpha/machdep.c (revision 118740) @@ -1,2230 +1,2231 @@ /*- * Copyright (c) 1998 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center and by Chris G. Demetriou. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include "opt_compat.h" #include "opt_ddb.h" #include "opt_simos.h" #include "opt_msgbuf.h" #include "opt_maxmem.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct proc* curproc; struct proc* fpcurproc; struct pcb* curpcb; u_int64_t cycles_per_usec; u_int32_t cycles_per_sec; int whichqs, whichrtqs, whichidqs; int cold = 1; struct platform platform; alpha_chipset_t chipset; struct bootinfo_kernel bootinfo; struct timeval switchtime; int switchticks; struct user *proc0paddr; char machine[] = "alpha"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static char cpu_model[128]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0, ""); #ifdef DDB /* start and end of kernel symbol table */ void *ksym_start, *ksym_end; #endif int alpha_unaligned_print = 1; /* warn about unaligned accesses */ int alpha_unaligned_fix = 1; /* fix up unaligned accesses */ int alpha_unaligned_sigbus = 0; /* don't SIGBUS on fixed-up accesses */ SYSCTL_INT(_machdep, CPU_UNALIGNED_PRINT, unaligned_print, CTLFLAG_RW, &alpha_unaligned_print, 0, ""); SYSCTL_INT(_machdep, CPU_UNALIGNED_FIX, unaligned_fix, CTLFLAG_RW, &alpha_unaligned_fix, 0, ""); SYSCTL_INT(_machdep, CPU_UNALIGNED_SIGBUS, unaligned_sigbus, CTLFLAG_RW, &alpha_unaligned_sigbus, 0, ""); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); struct msgbuf *msgbufp=0; int bootverbose = 0, Maxmem = 0; long dumplo; int totalphysmem; /* total amount of physical memory in system */ int physmem; /* physical memory used by NetBSD + some rsvd */ int resvmem; /* amount of memory reserved for PROM */ int unusedmem; /* amount of memory for OS that we don't use */ int unknownmem; /* amount of memory with an unknown use */ int ncpus; /* number of cpus */ vm_offset_t phys_avail[10]; static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, alpha_ptob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, 
alpha_ptob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); SYSCTL_INT(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, ""); /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static void identifycpu __P((void)); static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; /* * Hooked into the shutdown chain; if the system is to be halted, * unconditionally drop back to the SRM console. */ static void alpha_srm_shutdown(void *junk, int howto) { if (howto & RB_HALT) alpha_pal_halt(); } static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; vm_offset_t firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. */ printf("%s", version); identifycpu(); /* startrtclock(); */ #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ld (%ldK bytes)\n", alpha_ptob(Maxmem), alpha_ptob(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. 
*/ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; int kbytes = physmem * (PAGE_SIZE / 1024); nbuf = 50; if (kbytes > 4096) nbuf += min((kbytes - 4096) / factor, 65536 / factor); if (kbytes > 65536) nbuf += (kbytes - 65536) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } nswbuf = max(min(nbuf/4, 64), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (vm_offset_t)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size); mb_map->system_map = 1; + mbutltop = mbutl; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i]); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) #if defined(USERCONFIG_BOOT) if (1) #else if (boothowto & RB_CONFIG) #endif { userconfig(); cninit(); /* the preferred console may have changed */ } #endif printf("avail memory = %ld (%ldK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, alpha_srm_shutdown, 0, SHUTDOWN_PRI_LAST); } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } int unregister_netisr(num) int num; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("unregister_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = NULL; return (0); } /* * Retrieve the platform name from the DSR. */ const char * alpha_dsr_sysname() { struct dsrdb *dsr; const char *sysname; /* * DSR does not exist on early HWRPB versions. */ if (hwrpb->rpb_version < HWRPB_DSRDB_MINVERS) return (NULL); dsr = (struct dsrdb *)(((caddr_t)hwrpb) + hwrpb->rpb_dsrdb_off); sysname = (const char *)((caddr_t)dsr + (dsr->dsr_sysname_off + sizeof(u_int64_t))); return (sysname); } /* * Lookup the system specified system variation in the provided table, * returning the model string on match. 
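The buffer-cache sizing above follows the heuristic described in the preceding comment: cover roughly 1/4 of the first 64MB of RAM, 1/20 of RAM beyond that, capped by maxbcache. A minimal userland sketch of that calculation; the PAGE_SIZE and BKVASIZE values here are assumed for illustration (the real ones come from the kernel headers).

#include <stdio.h>

#define PAGE_SIZE 8192          /* Alpha page size, checked against the HWRPB above */
#define BKVASIZE  16384         /* assumed nominal buffer KVA size */

/* Sketch of the nbuf auto-sizing heuristic used in cpu_startup(). */
static int
autosize_nbuf(long physpages, long maxbcache)
{
    int factor = 4 * BKVASIZE / 1024;           /* the "1/4 of RAM" conversion */
    int kbytes = physpages * (PAGE_SIZE / 1024);
    int nbuf = 50;                              /* small fixed floor */
    int extra;

    if (kbytes > 4096) {                        /* 1/4 of RAM up to 64MB */
        extra = (kbytes - 4096) / factor;
        nbuf += extra < 65536 / factor ? extra : 65536 / factor;
    }
    if (kbytes > 65536)                         /* 1/20 of RAM beyond 64MB */
        nbuf += (kbytes - 65536) * 2 / (factor * 5);
    if (maxbcache && nbuf > maxbcache / BKVASIZE)
        nbuf = maxbcache / BKVASIZE;            /* cap the total KVA reservation */
    return (nbuf);
}

int
main(void)
{
    /* e.g. 128MB of 8K pages, no maxbcache limit */
    printf("nbuf = %d\n", autosize_nbuf(128L * 1024 * 1024 / PAGE_SIZE, 0));
    return (0);
}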
*/ const char * alpha_variation_name(u_int64_t variation, const struct alpha_variation_table *avtp) { int i; for (i = 0; avtp[i].avt_model != NULL; i++) if (avtp[i].avt_variation == variation) return (avtp[i].avt_model); return (NULL); } /* * Generate a default platform name based for unknown system variations. */ const char * alpha_unknown_sysname() { static char s[128]; /* safe size */ snprintf(s, sizeof(s), "%s family, unknown model variation 0x%lx", platform.family, hwrpb->rpb_variation & SV_ST_MASK); return ((const char *)s); } static void identifycpu(void) { u_int64_t type, major, minor; u_int64_t amask; struct pcs *pcsp; char *cpuname[] = { "unknown", /* 0 */ "EV3", /* 1 */ "EV4 (21064)", /* 2 */ "Simulation", /* 3 */ "LCA Family", /* 4 */ "EV5 (21164)", /* 5 */ "EV45 (21064A)", /* 6 */ "EV56 (21164A)", /* 7 */ "EV6 (21264)", /* 8 */ "PCA56 (21164PC)" /* 9 */ }; /* * print out CPU identification information. */ printf("%s\n%s, %ldMHz\n", platform.family, platform.model, hwrpb->rpb_cc_freq / 1000000); /* XXX true for 21164? */ printf("%ld byte page size, %d processor%s.\n", hwrpb->rpb_page_size, ncpus, ncpus == 1 ? "" : "s"); #if 0 /* this isn't defined for any systems that we run on? */ printf("serial number 0x%lx 0x%lx\n", ((long *)hwrpb->rpb_ssn)[0], ((long *)hwrpb->rpb_ssn)[1]); /* and these aren't particularly useful! */ printf("variation: 0x%lx, revision 0x%lx\n", hwrpb->rpb_variation, *(long *)hwrpb->rpb_revision); #endif pcsp = LOCATE_PCS(hwrpb, hwrpb->rpb_primary_cpu_id); /* cpu type */ type = pcsp->pcs_proc_type; major = (type & PCS_PROC_MAJOR) >> PCS_PROC_MAJORSHIFT; minor = (type & PCS_PROC_MINOR) >> PCS_PROC_MINORSHIFT; if (major < sizeof(cpuname)/sizeof(char *)) printf("CPU: %s major=%lu minor=%lu", cpuname[major], major, minor); else printf("CPU: major=%lu minor=%lu\n", major, minor); /* amask */ if (major >= PCS_PROC_EV56) { amask = 0xffffffff; /* 32 bit for printf */ amask = (~alpha_amask(amask)) & amask; printf(" extensions=0x%b\n", (u_int32_t) amask, "\020" "\001BWX" "\002FIX" "\003CIX" "\011MVI" "\012PRECISE" ); } else printf("\n"); /* PAL code */ printf("OSF PAL rev: 0x%lx\n", pcsp->pcs_palrevisions[PALvar_OSF1]); } extern char kernel_text[], _end[]; void alpha_init(pfn, ptb, bim, bip, biv) u_long pfn; /* first free PFN number */ u_long ptb; /* PFN of current level 1 page table */ u_long bim; /* bootinfo magic */ u_long bip; /* bootinfo pointer */ u_long biv; /* bootinfo version */ { int phys_avail_cnt; char *bootinfo_msg; vm_offset_t kernstart, kernend; vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1; struct mddt *mddtp; struct mddt_cluster *memc; int i, mddtweird; int cputype; char *p; /* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */ /* * Turn off interrupts (not mchecks) and floating point. * Make sure the instruction and data streams are consistent. */ (void)alpha_pal_swpipl(ALPHA_PSL_IPL_HIGH); /* alpha_pal_wrfen(0); */ ALPHA_TBIA(); alpha_pal_imb(); /* * Get critical system information (if possible, from the * information provided by the boot program). 
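identifycpu() above prints the architecture-extension bits returned by the AMASK instruction using the kernel's "%b" bit-name format. Below is a small sketch of the same decoding done by hand; the bit positions are my reading of the name string in the printf above (BWX, FIX, CIX, MVI, PRECISE mapping to bits 0, 1, 2, 8 and 9), not something stated elsewhere in this file.

#include <stdio.h>
#include <stdint.h>

/* Bit positions assumed from the "%b" name string in identifycpu():
 * BWX=bit 0, FIX=bit 1, CIX=bit 2, MVI=bit 8, PRECISE=bit 9. */
static const struct {
    int bit;
    const char *name;
} amask_bits[] = {
    { 0, "BWX" }, { 1, "FIX" }, { 2, "CIX" }, { 8, "MVI" }, { 9, "PRECISE" },
};

/* Print the extension names for an (already inverted) amask value. */
static void
print_extensions(uint32_t amask)
{
    size_t i;
    int first = 1;

    printf("extensions=0x%x <", amask);
    for (i = 0; i < sizeof(amask_bits) / sizeof(amask_bits[0]); i++) {
        if (amask & (1u << amask_bits[i].bit)) {
            printf("%s%s", first ? "" : ",", amask_bits[i].name);
            first = 0;
        }
    }
    printf(">\n");
}

int
main(void)
{
    print_extensions(0x303);    /* BWX, FIX, MVI and PRECISE set */
    return (0);
}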
*/ bootinfo_msg = NULL; if (bim == BOOTINFO_MAGIC) { if (biv == 0) { /* backward compat */ biv = *(u_long *)bip; bip += 8; } switch (biv) { case 1: { struct bootinfo_v1 *v1p = (struct bootinfo_v1 *)bip; bootinfo.ssym = v1p->ssym; bootinfo.esym = v1p->esym; bootinfo.kernend = v1p->kernend; bootinfo.modptr = v1p->modptr; bootinfo.envp = v1p->envp; /* hwrpb may not be provided by boot block in v1 */ if (v1p->hwrpb != NULL) { bootinfo.hwrpb_phys = ((struct rpb *)v1p->hwrpb)->rpb_phys; bootinfo.hwrpb_size = v1p->hwrpbsize; } else { bootinfo.hwrpb_phys = ((struct rpb *)HWRPB_ADDR)->rpb_phys; bootinfo.hwrpb_size = ((struct rpb *)HWRPB_ADDR)->rpb_size; } bcopy(v1p->boot_flags, bootinfo.boot_flags, min(sizeof v1p->boot_flags, sizeof bootinfo.boot_flags)); bcopy(v1p->booted_kernel, bootinfo.booted_kernel, min(sizeof v1p->booted_kernel, sizeof bootinfo.booted_kernel)); /* booted dev not provided in bootinfo */ init_prom_interface((struct rpb *) ALPHA_PHYS_TO_K0SEG(bootinfo.hwrpb_phys)); prom_getenv(PROM_E_BOOTED_DEV, bootinfo.booted_dev, sizeof bootinfo.booted_dev); break; } default: bootinfo_msg = "unknown bootinfo version"; goto nobootinfo; } } else { bootinfo_msg = "boot program did not pass bootinfo"; nobootinfo: bootinfo.ssym = (u_long)&_end; bootinfo.esym = (u_long)&_end; #ifdef SIMOS { char* p = (char*)bootinfo.ssym + 8; if (p[EI_MAG0] == ELFMAG0 && p[EI_MAG1] == ELFMAG1 && p[EI_MAG2] == ELFMAG2 && p[EI_MAG3] == ELFMAG3) { bootinfo.ssym = (u_long) p; bootinfo.esym = (u_long)p + *(u_long*)(p - 8); } } #endif bootinfo.hwrpb_phys = ((struct rpb *)HWRPB_ADDR)->rpb_phys; bootinfo.hwrpb_size = ((struct rpb *)HWRPB_ADDR)->rpb_size; init_prom_interface((struct rpb *)HWRPB_ADDR); prom_getenv(PROM_E_BOOTED_OSFLAGS, bootinfo.boot_flags, sizeof bootinfo.boot_flags); #ifndef SIMOS prom_getenv(PROM_E_BOOTED_FILE, bootinfo.booted_kernel, sizeof bootinfo.booted_kernel); #endif prom_getenv(PROM_E_BOOTED_DEV, bootinfo.booted_dev, sizeof bootinfo.booted_dev); } /* * Initialize the kernel's mapping of the RPB. It's needed for * lots of things. */ hwrpb = (struct rpb *)ALPHA_PHYS_TO_K0SEG(bootinfo.hwrpb_phys); /* * Remember how many cycles there are per microsecond, * so that we can use delay(). Round up, for safety. */ cycles_per_usec = (hwrpb->rpb_cc_freq + 999999) / 1000000; /* * Remember how many cycles per closk for coping with missed * clock interrupts. */ cycles_per_sec = hwrpb->rpb_cc_freq; /* Get the loader(8) metadata */ preload_metadata = (caddr_t)bootinfo.modptr; kern_envp = bootinfo.envp; /* Do basic tuning, hz etc */ init_param1(); /* * Initalize the (temporary) bootstrap console interface, so * we can use printf until the VM system starts being setup. * The real console is initialized before then. */ init_bootstrap_console(); /* OUTPUT NOW ALLOWED */ /* delayed from above */ if (bootinfo_msg) printf("WARNING: %s (0x%lx, 0x%lx, 0x%lx)\n", bootinfo_msg, bim, bip, biv); /* * Point interrupt/exception vectors to our own. */ alpha_pal_wrent(XentInt, ALPHA_KENTRY_INT); alpha_pal_wrent(XentArith, ALPHA_KENTRY_ARITH); alpha_pal_wrent(XentMM, ALPHA_KENTRY_MM); alpha_pal_wrent(XentIF, ALPHA_KENTRY_IF); alpha_pal_wrent(XentUna, ALPHA_KENTRY_UNA); alpha_pal_wrent(XentSys, ALPHA_KENTRY_SYS); /* * Clear pending machine checks and error reports, and enable * system- and processor-correctable error reporting. */ alpha_pal_wrmces(alpha_pal_rdmces() & ~(ALPHA_MCES_DSC|ALPHA_MCES_DPC)); /* * Find out what hardware we're on, and do basic initialization. 
*/ cputype = hwrpb->rpb_type; if (cputype < 0) { /* * At least some white-box (NT) systems have SRM which * reports a systype that's the negative of their * blue-box (UNIX/OVMS) counterpart. */ cputype = -cputype; } if (cputype >= API_ST_BASE) { if (cputype >= napi_cpuinit + API_ST_BASE) { platform_not_supported(cputype); /* NOTREACHED */ } cputype -= API_ST_BASE; api_cpuinit[cputype].init(cputype); } else { if (cputype >= ncpuinit) { platform_not_supported(cputype); /* NOTREACHED */ } cpuinit[cputype].init(cputype); } snprintf(cpu_model, sizeof(cpu_model), "%s", platform.model); /* * Initalize the real console, so the the bootstrap console is * no longer necessary. */ if (platform.cons_init) platform.cons_init(); /* NO MORE FIRMWARE ACCESS ALLOWED */ #ifdef _PMAP_MAY_USE_PROM_CONSOLE /* * XXX (unless _PMAP_MAY_USE_PROM_CONSOLE is defined and * XXX pmap_uses_prom_console() evaluates to non-zero.) */ #endif /* * find out this system's page size */ if (hwrpb->rpb_page_size != PAGE_SIZE) panic("page size %ld != 8192?!", hwrpb->rpb_page_size); /* * Find the beginning and end of the kernel (and leave a * bit of space before the beginning for the bootstrap * stack). */ kernstart = trunc_page(kernel_text) - 2 * PAGE_SIZE; #ifdef DDB ksym_start = (void *)bootinfo.ssym; ksym_end = (void *)bootinfo.esym; kernend = (vm_offset_t)round_page(ksym_end); #else kernend = (vm_offset_t)round_page(_end); #endif /* But if the bootstrap tells us otherwise, believe it! */ if (bootinfo.kernend) kernend = round_page(bootinfo.kernend); p = getenv("kernelname"); if (p) strncpy(kernelname, p, sizeof(kernelname) - 1); kernstartpfn = atop(ALPHA_K0SEG_TO_PHYS(kernstart)); kernendpfn = atop(ALPHA_K0SEG_TO_PHYS(kernend)); #ifdef SIMOS /* * SimOS console puts the bootstrap stack after kernel */ kernendpfn += 4; #endif /* * Find out how much memory is available, by looking at * the memory cluster descriptors. This also tries to do * its best to detect things things that have never been seen * before... */ mddtp = (struct mddt *)(((caddr_t)hwrpb) + hwrpb->rpb_memdat_off); /* MDDT SANITY CHECKING */ mddtweird = 0; if (mddtp->mddt_cluster_cnt < 2) { mddtweird = 1; printf("WARNING: weird number of mem clusters: %ld\n", mddtp->mddt_cluster_cnt); } #ifdef DEBUG_CLUSTER printf("Memory cluster count: %d\n", mddtp->mddt_cluster_cnt); #endif phys_avail_cnt = 0; for (i = 0; i < mddtp->mddt_cluster_cnt; i++) { memc = &mddtp->mddt_clusters[i]; #ifdef DEBUG_CLUSTER printf("MEMC %d: pfn 0x%lx cnt 0x%lx usage 0x%lx\n", i, memc->mddt_pfn, memc->mddt_pg_cnt, memc->mddt_usage); #endif totalphysmem += memc->mddt_pg_cnt; if (memc->mddt_usage & MDDT_mbz) { mddtweird = 1; printf("WARNING: mem cluster %d has weird " "usage 0x%lx\n", i, memc->mddt_usage); unknownmem += memc->mddt_pg_cnt; continue; } if (memc->mddt_usage & MDDT_NONVOLATILE) { /* XXX should handle these... */ printf("WARNING: skipping non-volatile mem " "cluster %d\n", i); unusedmem += memc->mddt_pg_cnt; continue; } if (memc->mddt_usage & MDDT_PALCODE) { resvmem += memc->mddt_pg_cnt; continue; } /* * We have a memory cluster available for system * software use. We must determine if this cluster * holds the kernel. */ /* * XXX If the kernel uses the PROM console, we only use the * XXX memory after the kernel in the first system segment, * XXX to avoid clobbering prom mapping, data, etc. 
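The cluster walk that continues below adds each usable memory cluster to phys_avail[], and has to split the cluster that contains the kernel image into the free pieces before and after it. A userland sketch of just that split, under the assumption that page frame ranges are half-open [pfn0, pfn1); the real code stores byte addresses via alpha_ptob() rather than raw PFNs.

#include <stdio.h>

/* Append the usable parts of cluster [pfn0, pfn1) to avail[],
 * skipping the kernel's pages [kstart, kend).  Returns the new
 * number of chunks.  Sketch only. */
static int
add_cluster(unsigned long avail[][2], int n,
    unsigned long pfn0, unsigned long pfn1,
    unsigned long kstart, unsigned long kend)
{
    if (pfn1 <= kstart || kend <= pfn0) {
        /* Cluster does not touch the kernel: take it whole. */
        avail[n][0] = pfn0; avail[n][1] = pfn1;
        return (n + 1);
    }
    if (pfn0 < kstart) {        /* free chunk before the kernel */
        avail[n][0] = pfn0; avail[n][1] = kstart;
        n++;
    }
    if (kend < pfn1) {          /* free chunk after the kernel */
        avail[n][0] = kend; avail[n][1] = pfn1;
        n++;
    }
    return (n);
}

int
main(void)
{
    unsigned long avail[4][2];
    int i, n = 0;

    n = add_cluster(avail, n, 0, 16384, 256, 1024);     /* kernel inside */
    n = add_cluster(avail, n, 16384, 32768, 256, 1024); /* kernel elsewhere */
    for (i = 0; i < n; i++)
        printf("chunk %d: pfn 0x%lx - 0x%lx\n", i, avail[i][0], avail[i][1]);
    return (0);
}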
*/ physmem += memc->mddt_pg_cnt; pfn0 = memc->mddt_pfn; pfn1 = memc->mddt_pfn + memc->mddt_pg_cnt; if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) { /* * Must compute the location of the kernel * within the segment. */ #ifdef DEBUG_CLUSTER printf("Cluster %d contains kernel\n", i); #endif if (!pmap_uses_prom_console()) { if (pfn0 < kernstartpfn) { /* * There is a chunk before the kernel. */ #ifdef DEBUG_CLUSTER printf("Loading chunk before kernel: " "0x%lx / 0x%lx\n", pfn0, kernstartpfn); #endif phys_avail[phys_avail_cnt] = alpha_ptob(pfn0); phys_avail[phys_avail_cnt+1] = alpha_ptob(kernstartpfn); phys_avail_cnt += 2; } } if (kernendpfn < pfn1) { /* * There is a chunk after the kernel. */ #ifdef DEBUG_CLUSTER printf("Loading chunk after kernel: " "0x%lx / 0x%lx\n", kernendpfn, pfn1); #endif phys_avail[phys_avail_cnt] = alpha_ptob(kernendpfn); phys_avail[phys_avail_cnt+1] = alpha_ptob(pfn1); phys_avail_cnt += 2; } } else { /* * Just load this cluster as one chunk. */ #ifdef DEBUG_CLUSTER printf("Loading cluster %d: 0x%lx / 0x%lx\n", i, pfn0, pfn1); #endif phys_avail[phys_avail_cnt] = alpha_ptob(pfn0); phys_avail[phys_avail_cnt+1] = alpha_ptob(pfn1); phys_avail_cnt += 2; } } phys_avail[phys_avail_cnt] = 0; /* * Dump out the MDDT if it looks odd... */ if (mddtweird) { printf("\n"); printf("complete memory cluster information:\n"); for (i = 0; i < mddtp->mddt_cluster_cnt; i++) { printf("mddt %d:\n", i); printf("\tpfn %lx\n", mddtp->mddt_clusters[i].mddt_pfn); printf("\tcnt %lx\n", mddtp->mddt_clusters[i].mddt_pg_cnt); printf("\ttest %lx\n", mddtp->mddt_clusters[i].mddt_pg_test); printf("\tbva %lx\n", mddtp->mddt_clusters[i].mddt_v_bitaddr); printf("\tbpa %lx\n", mddtp->mddt_clusters[i].mddt_p_bitaddr); printf("\tbcksum %lx\n", mddtp->mddt_clusters[i].mddt_bit_cksum); printf("\tusage %lx\n", mddtp->mddt_clusters[i].mddt_usage); } printf("\n"); } Maxmem = physmem; #ifdef MAXMEM /* * MAXMEM define is in kilobytes. */ Maxmem = alpha_btop(MAXMEM * 1024); #endif /* * hw.physmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. */ if ((p = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(p, &ep, 0); if ((ep != p) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", p); else Maxmem = alpha_btop(AllowMem); } while (physmem > Maxmem) { int i = phys_avail_cnt - 2; size_t sz = alpha_btop(phys_avail[i+1] - phys_avail[i]); size_t nsz; if (physmem - sz > Maxmem) { phys_avail[i] = 0; phys_avail_cnt -= 2; } else { nsz = sz - (physmem - Maxmem); phys_avail[i+1] = phys_avail[i] + alpha_ptob(nsz); physmem -= (sz - nsz); } } init_param2(physmem); /* * Initialize error message buffer (at end of core). */ { size_t sz = round_page(MSGBUF_SIZE); int i = phys_avail_cnt - 2; /* shrink so that it'll fit in the last segment */ if (phys_avail[i+1] - phys_avail[i] < sz) sz = phys_avail[i+1] - phys_avail[i]; phys_avail[i+1] -= sz; msgbufp = (struct msgbuf*) ALPHA_PHYS_TO_K0SEG(phys_avail[i+1]); msgbufinit(msgbufp, sz); /* Remove the last segment if it now has no pages. 
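The hw.physmem handling above accepts k/m/g suffixes by deliberately falling through the switch, shifting the parsed value left by 10 bits per step, and then rejects the string if the shifts overflowed (the result became smaller than the number parsed). A standalone sketch of that parse, using strtoull in place of the kernel's strtouq:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Parse "128m", "1g", "65536k", or a plain byte count.  Returns 0 for
 * an invalid or overflowing size (sketch of the hw.physmem handling
 * in alpha_init()). */
static uint64_t
parse_memsize(const char *p)
{
    uint64_t val, sanity;
    char *ep;

    sanity = val = strtoull(p, &ep, 0);
    if (ep != p && *ep != '\0') {
        switch (*ep) {
        case 'g': case 'G':
            val <<= 10;         /* fall through */
        case 'm': case 'M':
            val <<= 10;         /* fall through */
        case 'k': case 'K':
            val <<= 10;
            break;
        default:
            return (0);         /* unknown suffix */
        }
        if (val < sanity)       /* the shifts wrapped around */
            return (0);
    }
    return (val);
}

int
main(void)
{
    printf("%llu\n", (unsigned long long)parse_memsize("128m"));
    printf("%llu\n", (unsigned long long)parse_memsize("1g"));
    return (0);
}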
*/ if (phys_avail[i] == phys_avail[i+1]) phys_avail[i] = 0; /* warn if the message buffer had to be shrunk */ if (sz != round_page(MSGBUF_SIZE)) printf("WARNING: %ld bytes not available for msgbuf in last cluster (%ld used)\n", round_page(MSGBUF_SIZE), sz); } /* * Init mapping for u page(s) for proc 0 */ proc0.p_addr = proc0paddr = (struct user *)pmap_steal_memory(UPAGES * PAGE_SIZE); /* * Initialize the virtual memory system, and set the * page table base register in proc 0's PCB. */ pmap_bootstrap(ALPHA_PHYS_TO_K0SEG(alpha_ptob(ptb)), hwrpb->rpb_max_asn); /* * Initialize the rest of proc 0's PCB, and cache its physical * address. */ proc0.p_md.md_pcbpaddr = (struct pcb *)ALPHA_K0SEG_TO_PHYS((vm_offset_t)&proc0paddr->u_pcb); /* * Set the kernel sp, reserving space for an (empty) trapframe, * and make proc0's trapframe pointer point to it for sanity. */ proc0paddr->u_pcb.pcb_hw.apcb_ksp = (u_int64_t)proc0paddr + USPACE - sizeof(struct trapframe); proc0.p_md.md_tf = (struct trapframe *)proc0paddr->u_pcb.pcb_hw.apcb_ksp; /* * Initialise entropy pool. */ rand_initialize(); /* * Look at arguments passed to us and compute boothowto. */ #ifdef KADB boothowto |= RB_KDB; #endif /* boothowto |= RB_KDB | RB_GDB; */ for (p = bootinfo.boot_flags; p && *p != '\0'; p++) { /* * Note that we'd really like to differentiate case here, * but the Alpha AXP Architecture Reference Manual * says that we shouldn't. */ switch (*p) { case 'a': /* autoboot */ case 'A': boothowto &= ~RB_SINGLE; break; #ifdef DEBUG case 'c': /* crash dump immediately after autoconfig */ case 'C': boothowto |= RB_DUMP; break; #endif #if defined(DDB) case 'd': /* break into the kernel debugger ASAP */ case 'D': boothowto |= RB_KDB; break; case 'g': /* use kernel gdb */ case 'G': boothowto |= RB_GDB; break; #endif case 'h': /* always halt, never reboot */ case 'H': boothowto |= RB_HALT; break; #if 0 case 'm': /* mini root present in memory */ case 'M': boothowto |= RB_MINIROOT; break; #endif case 'n': /* askname */ case 'N': boothowto |= RB_ASKNAME; break; case 's': /* single-user (default, supported for sanity) */ case 'S': boothowto |= RB_SINGLE; break; case 'v': case 'V': boothowto |= RB_VERBOSE; bootverbose = 1; break; default: printf("Unrecognized boot flag '%c'.\n", *p); break; } } /* * Catch case of boot_verbose set in environment. */ if ((p = getenv("boot_verbose")) != NULL) { if (strcmp(p, "yes") == 0 || strcmp(p, "YES") == 0) { boothowto |= RB_VERBOSE; bootverbose = 1; } } /* * Initialize debuggers, and break into them if appropriate. */ #ifdef DDB kdb_init(); if (boothowto & RB_KDB) { printf("Boot flags requested debugger\n"); breakpoint(); } #endif /* * Figure out the number of cpus in the box, from RPB fields. * Really. We mean it. */ for (i = 0; i < hwrpb->rpb_pcs_cnt; i++) { struct pcs *pcsp; pcsp = (struct pcs *)((char *)hwrpb + hwrpb->rpb_pcs_off + (i * hwrpb->rpb_pcs_size)); if ((pcsp->pcs_flags & PCS_PP) != 0) ncpus++; } /* * Figure out our clock frequency, from RPB fields. 
*/ hz = hwrpb->rpb_intr_freq >> 12; if (!(60 <= hz && hz <= 10240)) { hz = 1024; #ifdef DIAGNOSTIC printf("WARNING: unbelievable rpb_intr_freq: %ld (%d hz)\n", hwrpb->rpb_intr_freq, hz); #endif } alpha_pal_wrfen(0); } void bzero(void *buf, size_t len) { caddr_t p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } void DELAY(int n) { #ifndef SIMOS unsigned long pcc0, pcc1, curcycle, cycles; int usec; if (n == 0) return; pcc0 = alpha_rpcc() & 0xffffffffUL; cycles = 0; usec = 0; while (usec <= n) { /* * Get the next CPU cycle count. The assumption here * is that we can't have wrapped twice past 32 bits worth * of CPU cycles since we last checked. */ pcc1 = alpha_rpcc() & 0xffffffffUL; if (pcc1 < pcc0) { curcycle = (pcc1 + 0x100000000UL) - pcc0; } else { curcycle = pcc1 - pcc0; } /* * We now have the number of processor cycles since we * last checked. Add the current cycle count to the * running total. If it's over cycles_per_usec, increment * the usec counter. */ cycles += curcycle; while (cycles > cycles_per_usec) { usec++; cycles -= cycles_per_usec; } pcc0 = pcc1; } #endif } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void osendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) { struct proc *p = curproc; osiginfo_t *sip, ksi; struct trapframe *frame; struct sigacts *psp = p->p_sigacts; int oonstack, fsize, rndfsize; frame = p->p_md.md_tf; oonstack = p->p_sigstk.ss_flags & SS_ONSTACK; fsize = sizeof ksi; rndfsize = ((fsize + 15) / 16) * 16; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sip = (osiginfo_t *)((caddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size - rndfsize); p->p_sigstk.ss_flags |= SS_ONSTACK; } else sip = (osiginfo_t *)(alpha_pal_rdusp() - rndfsize); (void)grow_stack(p, (u_long)sip); if (!useracc((caddr_t)sip, fsize, VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); return; } /* * Build the signal context to be used by sigreturn. */ ksi.si_sc.sc_onstack = oonstack; SIG2OSIG(*mask, ksi.si_sc.sc_mask); ksi.si_sc.sc_pc = frame->tf_regs[FRAME_PC]; ksi.si_sc.sc_ps = frame->tf_regs[FRAME_PS]; /* copy the registers. */ fill_regs(p, (struct reg *)ksi.si_sc.sc_regs); ksi.si_sc.sc_regs[R_ZERO] = 0xACEDBADE; /* magic number */ ksi.si_sc.sc_regs[R_SP] = alpha_pal_rdusp(); /* save the floating-point state, if necessary, then copy it. 
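DELAY() above busy-waits on the low 32 bits of the processor cycle counter, which wraps; the difference between samples is computed so that a single wrap is handled correctly, matching the assumption spelled out in its comment. A userland sketch of just the wrap-safe delta:

#include <stdio.h>
#include <stdint.h>

/* Wrap-safe difference between two samples of a 32-bit cycle counter,
 * assuming at most one wrap between the samples (as DELAY() assumes). */
static uint64_t
cc_delta(uint64_t prev, uint64_t now)
{
    prev &= 0xffffffffULL;
    now  &= 0xffffffffULL;
    if (now < prev)
        return ((now + 0x100000000ULL) - prev);
    return (now - prev);
}

int
main(void)
{
    /* Counter wrapped from near the top back to a small value. */
    printf("%llu\n", (unsigned long long)cc_delta(0xfffffff0UL, 0x10));
    printf("%llu\n", (unsigned long long)cc_delta(100, 500));
    return (0);
}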
*/ alpha_fpstate_save(p, 1); /* XXX maybe write=0 */ ksi.si_sc.sc_ownedfp = p->p_md.md_flags & MDP_FPUSED; bcopy(&p->p_addr->u_pcb.pcb_fp, (struct fpreg *)ksi.si_sc.sc_fpregs, sizeof(struct fpreg)); ksi.si_sc.sc_fp_control = p->p_addr->u_pcb.pcb_fp_control; bzero(ksi.si_sc.sc_reserved, sizeof ksi.si_sc.sc_reserved); /* XXX */ ksi.si_sc.sc_xxx1[0] = 0; /* XXX */ ksi.si_sc.sc_xxx1[1] = 0; /* XXX */ ksi.si_sc.sc_traparg_a0 = frame->tf_regs[FRAME_TRAPARG_A0]; ksi.si_sc.sc_traparg_a1 = frame->tf_regs[FRAME_TRAPARG_A1]; ksi.si_sc.sc_traparg_a2 = frame->tf_regs[FRAME_TRAPARG_A2]; ksi.si_sc.sc_xxx2[0] = 0; /* XXX */ ksi.si_sc.sc_xxx2[1] = 0; /* XXX */ ksi.si_sc.sc_xxx2[2] = 0; /* XXX */ /* Fill in POSIX parts */ ksi.si_signo = sig; ksi.si_code = code; ksi.si_value.sigval_ptr = NULL; /* XXX */ /* * copy the frame out to userland. */ (void) copyout((caddr_t)&ksi, (caddr_t)sip, fsize); /* * Set up the registers to return to sigcode. */ frame->tf_regs[FRAME_PC] = PS_STRINGS - (esigcode - sigcode); frame->tf_regs[FRAME_A0] = sig; if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) frame->tf_regs[FRAME_A1] = (u_int64_t)sip; else frame->tf_regs[FRAME_A1] = code; frame->tf_regs[FRAME_A2] = (u_int64_t)&sip->si_sc; frame->tf_regs[FRAME_T12] = (u_int64_t)catcher; /* t12 is pv */ alpha_pal_wrusp((unsigned long)sip); } void sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) { struct proc *p = curproc; struct trapframe *frame; struct sigacts *psp = p->p_sigacts; struct sigframe sf, *sfp; int oonstack, rndfsize; if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, sig, mask, code); return; } frame = p->p_md.md_tf; oonstack = (p->p_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0; rndfsize = ((sizeof(sf) + 15) / 16) * 16; /* save user context */ bzero(&sf, sizeof(struct sigframe)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_mcontext.mc_onstack = oonstack; fill_regs(p, (struct reg *)sf.sf_uc.uc_mcontext.mc_regs); sf.sf_uc.uc_mcontext.mc_regs[R_SP] = alpha_pal_rdusp(); sf.sf_uc.uc_mcontext.mc_regs[R_ZERO] = 0xACEDBADE; /* magic number */ sf.sf_uc.uc_mcontext.mc_regs[R_PS] = frame->tf_regs[FRAME_PS]; sf.sf_uc.uc_mcontext.mc_regs[R_PC] = frame->tf_regs[FRAME_PC]; sf.sf_uc.uc_mcontext.mc_regs[R_TRAPARG_A0] = frame->tf_regs[FRAME_TRAPARG_A0]; sf.sf_uc.uc_mcontext.mc_regs[R_TRAPARG_A1] = frame->tf_regs[FRAME_TRAPARG_A1]; sf.sf_uc.uc_mcontext.mc_regs[R_TRAPARG_A2] = frame->tf_regs[FRAME_TRAPARG_A2]; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)((caddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size - rndfsize); p->p_sigstk.ss_flags |= SS_ONSTACK; } else sfp = (struct sigframe *)(alpha_pal_rdusp() - rndfsize); (void)grow_stack(p, (u_long)sfp); #ifdef DEBUG if ((sigdebug & SDB_KSTACK) && p->p_pid == sigpid) printf("sendsig(%d): sig %d ssp %p usp %p\n", p->p_pid, sig, &sf, sfp); #endif if (!useracc((caddr_t)sfp, sizeof(sf), VM_PROT_WRITE)) { #ifdef DEBUG if ((sigdebug & SDB_KSTACK) && p->p_pid == sigpid) printf("sendsig(%d): useracc failed on sig %d\n", p->p_pid, sig); #endif /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. 
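Both sendsig() variants above round the frame size up to a 16-byte multiple (the Alpha stack alignment) and then place the frame either at the top of the alternate signal stack or just below the current user stack pointer. A minimal sketch of that placement arithmetic, with plain parameters standing in for the sigstack fields and the USP read from the PALcode:

#include <stdio.h>
#include <stdint.h>

/* Round a signal frame size up to the 16-byte stack alignment. */
static size_t
round_frame(size_t fsize)
{
    return (((fsize + 15) / 16) * 16);
}

/* Pick where the frame goes: top of the alternate stack if one is
 * armed and not already in use, otherwise just below the user SP. */
static uintptr_t
place_frame(size_t fsize, uintptr_t usp,
    int use_altstack, uintptr_t ss_sp, size_t ss_size)
{
    size_t rndfsize = round_frame(fsize);

    if (use_altstack)
        return (ss_sp + ss_size - rndfsize);
    return (usp - rndfsize);
}

int
main(void)
{
    printf("rounded %zu -> %zu\n", (size_t)808, round_frame(808));
    printf("frame at 0x%lx\n",
        (unsigned long)place_frame(808, (uintptr_t)0x7fffc000UL, 0, 0, 0));
    return (0);
}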
*/ SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); psignal(p, SIGILL); return; } /* save the floating-point state, if necessary, then copy it. */ alpha_fpstate_save(p, 1); sf.sf_uc.uc_mcontext.mc_ownedfp = p->p_md.md_flags & MDP_FPUSED; bcopy(&p->p_addr->u_pcb.pcb_fp, (struct fpreg *)sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(struct fpreg)); sf.sf_uc.uc_mcontext.mc_fp_control = p->p_addr->u_pcb.pcb_fp_control; #ifdef COMPAT_OSF1 /* * XXX Create an OSF/1-style sigcontext and associated goo. */ #endif /* * copy the frame out to userland. */ (void) copyout((caddr_t)&sf, (caddr_t)sfp, sizeof(sf)); #ifdef DEBUG if (sigdebug & SDB_FOLLOW) printf("sendsig(%d): sig %d sfp %p code %lx\n", p->p_pid, sig, sfp, code); #endif /* * Set up the registers to return to sigcode. */ frame->tf_regs[FRAME_PC] = PS_STRINGS - (esigcode - sigcode); frame->tf_regs[FRAME_A0] = sig; if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { frame->tf_regs[FRAME_A1] = (u_int64_t)&(sfp->sf_si); /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void*)frame->tf_regs[FRAME_TRAPARG_A0]; } else frame->tf_regs[FRAME_A1] = code; frame->tf_regs[FRAME_A2] = (u_int64_t)&(sfp->sf_uc); frame->tf_regs[FRAME_T12] = (u_int64_t)catcher; /* t12 is pv */ alpha_pal_wrusp((unsigned long)sfp); #ifdef DEBUG if (sigdebug & SDB_FOLLOW) printf("sendsig(%d): pc %lx, catcher %lx\n", p->p_pid, frame->tf_regs[FRAME_PC], frame->tf_regs[FRAME_A3]); if ((sigdebug & SDB_KSTACK) && p->p_pid == sigpid) printf("sendsig(%d): sig %d returns\n", p->p_pid, sig); #endif } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int osigreturn(struct proc *p, struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap) { struct osigcontext *scp, ksc; scp = uap->sigcntxp; /* * Fetch the entire context structure at once for speed. */ if (copyin((caddr_t)scp, (caddr_t)&ksc, sizeof ksc)) return (EFAULT); /* * XXX - Should we do this. What if we get a "handcrafted" * but valid sigcontext that hasn't the magic number? */ if (ksc.sc_regs[R_ZERO] != 0xACEDBADE) /* magic number */ return (EINVAL); /* * Restore the user-supplied information */ if (ksc.sc_onstack) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; /* * longjmp is still implemented by calling osigreturn. The new * sigmask is stored in sc_reserved, sc_mask is only used for * backward compatibility. */ SIGSETOLD(p->p_sigmask, ksc.sc_mask); SIG_CANTMASK(p->p_sigmask); set_regs(p, (struct reg *)ksc.sc_regs); p->p_md.md_tf->tf_regs[FRAME_PC] = ksc.sc_pc; p->p_md.md_tf->tf_regs[FRAME_PS] = (ksc.sc_ps | ALPHA_PSL_USERSET) & ~ALPHA_PSL_USERCLR; alpha_pal_wrusp(ksc.sc_regs[R_SP]); /* XXX ksc.sc_ownedfp ? 
*/ alpha_fpstate_drop(p); bcopy((struct fpreg *)ksc.sc_fpregs, &p->p_addr->u_pcb.pcb_fp, sizeof(struct fpreg)); p->p_addr->u_pcb.pcb_fp_control = ksc.sc_fp_control; return (EJUSTRETURN); } int sigreturn(struct proc *p, struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap) { ucontext_t uc, *ucp; struct pcb *pcb; unsigned long val; ucp = uap->sigcntxp; pcb = &p->p_addr->u_pcb; #ifdef DEBUG if (sigdebug & SDB_FOLLOW) printf("sigreturn: pid %d, scp %p\n", p->p_pid, ucp); #endif /* * Fetch the entire context structure at once for speed. * Note that struct osigcontext is smaller than a ucontext_t, * so even if copyin() faults, we may have actually gotten a complete * struct osigcontext. */ if (copyin((caddr_t)ucp, (caddr_t)&uc, sizeof(ucontext_t))) { if (((struct osigcontext*)&uc)->sc_regs[R_ZERO] == 0xACEDBADE) return osigreturn(p, (struct osigreturn_args *)uap); else return (EFAULT); } if (((struct osigcontext*)&uc)->sc_regs[R_ZERO] == 0xACEDBADE) return osigreturn(p, (struct osigreturn_args *)uap); /* * Restore the user-supplied information */ set_regs(p, (struct reg *)uc.uc_mcontext.mc_regs); val = (uc.uc_mcontext.mc_regs[R_PS] | ALPHA_PSL_USERSET) & ~ALPHA_PSL_USERCLR; p->p_md.md_tf->tf_regs[FRAME_PS] = val; p->p_md.md_tf->tf_regs[FRAME_PC] = uc.uc_mcontext.mc_regs[R_PC]; alpha_pal_wrusp(uc.uc_mcontext.mc_regs[R_SP]); if (uc.uc_mcontext.mc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; p->p_sigmask = uc.uc_sigmask; SIG_CANTMASK(p->p_sigmask); /* XXX ksc.sc_ownedfp ? */ alpha_fpstate_drop(p); bcopy((struct fpreg *)uc.uc_mcontext.mc_fpregs, &p->p_addr->u_pcb.pcb_fp, sizeof(struct fpreg)); p->p_addr->u_pcb.pcb_fp_control = uc.uc_mcontext.mc_fp_control; #ifdef DEBUG if (sigdebug & SDB_FOLLOW) printf("sigreturn(%d): returns\n", p->p_pid); #endif return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { /*alpha_pal_halt(); */ prom_halt(1); } /* * Clear registers on exec */ void setregs(struct proc *p, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tfp = p->p_md.md_tf; bzero(tfp->tf_regs, FRAME_SIZE * sizeof tfp->tf_regs[0]); bzero(&p->p_addr->u_pcb.pcb_fp, sizeof p->p_addr->u_pcb.pcb_fp); p->p_addr->u_pcb.pcb_fp_control = 0; p->p_addr->u_pcb.pcb_fp.fpr_cr = (FPCR_DYN_NORMAL | FPCR_INVD | FPCR_DZED | FPCR_OVFD | FPCR_INED | FPCR_UNFD); alpha_pal_wrusp(stack); tfp->tf_regs[FRAME_PS] = ALPHA_PSL_USERSET; tfp->tf_regs[FRAME_PC] = entry & ~3; tfp->tf_regs[FRAME_A0] = stack; /* a0 = sp */ tfp->tf_regs[FRAME_A1] = 0; /* a1 = rtld cleanup */ tfp->tf_regs[FRAME_A2] = 0; /* a2 = rtld object */ tfp->tf_regs[FRAME_A3] = PS_STRINGS; /* a3 = ps_strings */ tfp->tf_regs[FRAME_T12] = tfp->tf_regs[FRAME_PC]; /* a.k.a. 
PV */ p->p_md.md_flags &= ~MDP_FPUSED; alpha_fpstate_drop(p); } int ptrace_set_pc(struct proc *p, unsigned long addr) { struct trapframe *tp = p->p_md.md_tf; tp->tf_regs[FRAME_PC] = addr; return 0; } static int ptrace_read_int(struct proc *p, vm_offset_t addr, u_int32_t *v) { struct iovec iov; struct uio uio; iov.iov_base = (caddr_t) v; iov.iov_len = sizeof(u_int32_t); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)addr; uio.uio_resid = sizeof(u_int32_t); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_procp = p; return procfs_domem(curproc, p, NULL, &uio); } static int ptrace_write_int(struct proc *p, vm_offset_t addr, u_int32_t v) { struct iovec iov; struct uio uio; iov.iov_base = (caddr_t) &v; iov.iov_len = sizeof(u_int32_t); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)addr; uio.uio_resid = sizeof(u_int32_t); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_procp = p; return procfs_domem(curproc, p, NULL, &uio); } static u_int64_t ptrace_read_register(struct proc *p, int regno) { static int reg_to_frame[32] = { FRAME_V0, FRAME_T0, FRAME_T1, FRAME_T2, FRAME_T3, FRAME_T4, FRAME_T5, FRAME_T6, FRAME_T7, FRAME_S0, FRAME_S1, FRAME_S2, FRAME_S3, FRAME_S4, FRAME_S5, FRAME_S6, FRAME_A0, FRAME_A1, FRAME_A2, FRAME_A3, FRAME_A4, FRAME_A5, FRAME_T8, FRAME_T9, FRAME_T10, FRAME_T11, FRAME_RA, FRAME_T12, FRAME_AT, FRAME_GP, FRAME_SP, -1, /* zero */ }; if (regno == R_ZERO) return 0; return p->p_md.md_tf->tf_regs[reg_to_frame[regno]]; } static int ptrace_clear_bpt(struct proc *p, struct mdbpt *bpt) { return ptrace_write_int(p, bpt->addr, bpt->contents); } static int ptrace_set_bpt(struct proc *p, struct mdbpt *bpt) { int error; u_int32_t bpins = 0x00000080; error = ptrace_read_int(p, bpt->addr, &bpt->contents); if (error) return error; return ptrace_write_int(p, bpt->addr, bpins); } int ptrace_clear_single_step(struct proc *p) { if (p->p_md.md_flags & MDP_STEP2) { ptrace_clear_bpt(p, &p->p_md.md_sstep[1]); ptrace_clear_bpt(p, &p->p_md.md_sstep[0]); p->p_md.md_flags &= ~MDP_STEP2; } else if (p->p_md.md_flags & MDP_STEP1) { ptrace_clear_bpt(p, &p->p_md.md_sstep[0]); p->p_md.md_flags &= ~MDP_STEP1; } return 0; } int ptrace_single_step(struct proc *p) { int error; vm_offset_t pc = p->p_md.md_tf->tf_regs[FRAME_PC]; alpha_instruction ins; vm_offset_t addr[2]; /* places to set breakpoints */ int count = 0; /* count of breakpoints */ if (p->p_md.md_flags & (MDP_STEP1|MDP_STEP2)) panic("ptrace_single_step: step breakpoints not removed"); error = ptrace_read_int(p, pc, &ins.bits); if (error) return error; switch (ins.branch_format.opcode) { case op_j: /* Jump: target is register value */ addr[0] = ptrace_read_register(p, ins.jump_format.rs) & ~3; count = 1; break; case op_br: case op_fbeq: case op_fblt: case op_fble: case op_bsr: case op_fbne: case op_fbge: case op_fbgt: case op_blbc: case op_beq: case op_blt: case op_ble: case op_blbs: case op_bne: case op_bge: case op_bgt: /* Branch: target is pc+4+4*displacement */ addr[0] = pc + 4; addr[1] = pc + 4 + 4 * ins.branch_format.displacement; count = 2; break; default: addr[0] = pc + 4; count = 1; } p->p_md.md_sstep[0].addr = addr[0]; error = ptrace_set_bpt(p, &p->p_md.md_sstep[0]); if (error) return error; if (count == 2) { p->p_md.md_sstep[1].addr = addr[1]; error = ptrace_set_bpt(p, &p->p_md.md_sstep[1]); if (error) { ptrace_clear_bpt(p, &p->p_md.md_sstep[0]); return error; } p->p_md.md_flags |= MDP_STEP2; } else p->p_md.md_flags |= MDP_STEP1; return 0; } int ptrace_read_u_check(p, addr, len) struct 
proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) (addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_tf - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if ((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(struct proc *p, vm_offset_t off, long data) { vm_offset_t min; #if 0 struct trapframe frame_copy; struct trapframe *tp; #endif /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_tf - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { #if 0 tp = p->p_md.md_tf; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); #endif *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_fp); if (off >= min && off <= min + sizeof(struct fpreg) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int alpha_pa_access(vm_offset_t pa) { #if 0 int i; for (i = 0; phys_avail[i] != 0; i += 2) { if (pa < phys_avail[i]) continue; if (pa < phys_avail[i+1]) return VM_PROT_READ|VM_PROT_WRITE; } return 0; #else return VM_PROT_READ|VM_PROT_WRITE; #endif } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb = &p->p_addr->u_pcb; struct trapframe *tp = p->p_md.md_tf; tp = p->p_md.md_tf; #define C(r) regs->r_regs[R_ ## r] = tp->tf_regs[FRAME_ ## r] C(V0); C(T0); C(T1); C(T2); C(T3); C(T4); C(T5); C(T6); C(T7); C(S0); C(S1); C(S2); C(S3); C(S4); C(S5); C(S6); C(A0); C(A1); C(A2); C(A3); C(A4); C(A5); C(T8); C(T9); C(T10); C(T11); C(RA); C(T12); C(AT); C(GP); #undef C regs->r_regs[R_ZERO] = tp->tf_regs[FRAME_PC]; regs->r_regs[R_SP] = pcb->pcb_hw.apcb_usp; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb = &p->p_addr->u_pcb; struct trapframe *tp = p->p_md.md_tf; tp = p->p_md.md_tf; #define C(r) tp->tf_regs[FRAME_ ## r] = regs->r_regs[R_ ## r] C(V0); C(T0); C(T1); C(T2); C(T3); C(T4); C(T5); C(T6); C(T7); C(S0); C(S1); C(S2); C(S3); C(S4); C(S5); C(S6); C(A0); C(A1); C(A2); C(A3); C(A4); C(A5); C(T8); C(T9); C(T10); C(T11); C(RA); C(T12); C(AT); C(GP); #undef C tp->tf_regs[FRAME_PC] = regs->r_regs[R_ZERO]; pcb->pcb_hw.apcb_usp = regs->r_regs[R_SP]; return (0); } int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { alpha_fpstate_save(p, 0); bcopy(&p->p_addr->u_pcb.pcb_fp, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { alpha_fpstate_drop(p); bcopy(fpregs, &p->p_addr->u_pcb.pcb_fp, sizeof *fpregs); return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { #if 0 struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? 
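ptrace_single_step() above implements single-stepping in software: it decodes the instruction at the current PC and plants breakpoints at every address the CPU could reach next; one for a jump target or plain fall-through, two for a conditional branch. A sketch of just that next-PC computation, with a simplified instruction description in place of the alpha_instruction union used above:

#include <stdio.h>
#include <stdint.h>

/*
 * Candidate next PCs for a (simplified) instruction.  kind 0 is an
 * ordinary instruction, 1 a jump through a register, 2 a conditional
 * branch with a signed displacement counted in instruction words.
 * Returns how many entries of addr[] were filled in.
 */
static int
next_pcs(uint64_t pc, int kind, uint64_t reg_target, int32_t disp,
    uint64_t addr[2])
{
    switch (kind) {
    case 1:                             /* jump: target is a register value */
        addr[0] = reg_target & ~(uint64_t)3;
        return (1);
    case 2:                             /* branch: fall-through plus target */
        addr[0] = pc + 4;
        addr[1] = pc + 4 + 4 * (int64_t)disp;
        return (2);
    default:                            /* everything else just falls through */
        addr[0] = pc + 4;
        return (1);
    }
}

int
main(void)
{
    uint64_t addr[2];
    int i, n;

    n = next_pcs(0x120001000ULL, 2, 0, -8, addr);   /* backward branch */
    for (i = 0; i < n; i++)
        printf("breakpoint at 0x%llx\n", (unsigned long long)addr[i]);
    return (0);
}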
*/ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; #endif return(-1); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); void alpha_fpstate_check(struct proc *p) { if (p->p_addr->u_pcb.pcb_hw.apcb_flags & ALPHA_PCB_FLAGS_FEN) if (p != fpcurproc) panic("alpha_check_fpcurproc: bogus"); } #define SET_FEN(p) \ (p)->p_addr->u_pcb.pcb_hw.apcb_flags |= ALPHA_PCB_FLAGS_FEN #define CLEAR_FEN(p) \ (p)->p_addr->u_pcb.pcb_hw.apcb_flags &= ~ALPHA_PCB_FLAGS_FEN /* * Save the floating point state in the pcb. Use this to get read-only * access to the floating point state. If write is true, the current * fp process is cleared so that fp state can safely be modified. The * process will automatically reload the changed state by generating a * FEN trap. */ void alpha_fpstate_save(struct proc *p, int write) { if (p == fpcurproc) { /* * If curproc != fpcurproc, then we need to enable FEN * so that we can dump the fp state. */ alpha_pal_wrfen(1); /* * Save the state in the pcb. */ savefpstate(&p->p_addr->u_pcb.pcb_fp); if (write) { /* * If fpcurproc == curproc, just ask the * PALcode to disable FEN, otherwise we must * clear the FEN bit in fpcurproc's pcb. */ if (fpcurproc == curproc) alpha_pal_wrfen(0); else CLEAR_FEN(fpcurproc); fpcurproc = NULL; } else { /* * Make sure that we leave FEN enabled if * curproc == fpcurproc. We must have at most * one process with FEN enabled. Note that FEN * must already be set in fpcurproc's pcb. */ if (curproc != fpcurproc) alpha_pal_wrfen(0); } } } /* * Relinquish ownership of the FP state. This is called instead of * alpha_save_fpstate() if the entire FP state is being changed * (e.g. on sigreturn). */ void alpha_fpstate_drop(struct proc *p) { if (p == fpcurproc) { if (p == curproc) { /* * Disable FEN via the PALcode. This will * clear the bit in the pcb as well. */ alpha_pal_wrfen(0); } else { /* * Clear the FEN bit of the pcb. */ CLEAR_FEN(p); } fpcurproc = NULL; } } /* * Switch the current owner of the fp state to p, reloading the state * from the pcb. */ void alpha_fpstate_switch(struct proc *p) { /* * Enable FEN so that we can access the fp registers. */ alpha_pal_wrfen(1); if (fpcurproc) { /* * Dump the old fp state if its valid. 
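alpha_fpstate_save(), alpha_fpstate_drop() and alpha_fpstate_switch() above implement lazy floating-point context handling: at most one process owns the FP unit (fpcurproc), FEN is enabled only for the owner, and register state is spilled to the pcb on demand. The sketch below is only a schematic of that ownership protocol, with hypothetical save_fp()/restore_fp() stubs; it is not the PALcode or pcb interface used above.

#include <stdio.h>

struct fpstate { double regs[32]; };
struct proc { const char *name; struct fpstate fp; int fp_enabled; };

static struct proc *fpcurproc;          /* current owner of the FP unit */

/* Hypothetical hardware hooks standing in for savefpstate()/restorefpstate(). */
static void save_fp(struct fpstate *fp)    { (void)fp; }
static void restore_fp(struct fpstate *fp) { (void)fp; }

/* Give the FP unit to p, spilling the old owner's registers first. */
static void
fpstate_switch(struct proc *p)
{
    if (fpcurproc != NULL) {
        save_fp(&fpcurproc->fp);        /* dump the old owner's state */
        fpcurproc->fp_enabled = 0;
    }
    fpcurproc = p;
    p->fp_enabled = 1;
    restore_fp(&p->fp);                 /* reload the new owner's state */
}

/* Drop ownership entirely, e.g. before overwriting the saved state. */
static void
fpstate_drop(struct proc *p)
{
    if (p == fpcurproc) {
        p->fp_enabled = 0;
        fpcurproc = NULL;
    }
}

int
main(void)
{
    struct proc a = { "a", {{0}}, 0 }, b = { "b", {{0}}, 0 };

    fpstate_switch(&a);
    fpstate_switch(&b);                 /* a's state is spilled here */
    fpstate_drop(&b);
    printf("owner: %s\n", fpcurproc ? fpcurproc->name : "none");
    return (0);
}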
*/ savefpstate(&fpcurproc->p_addr->u_pcb.pcb_fp); CLEAR_FEN(fpcurproc); } /* * Remember the new FP owner and reload its state. */ fpcurproc = p; restorefpstate(&fpcurproc->p_addr->u_pcb.pcb_fp); /* * If the new owner is curproc, leave FEN enabled, otherwise * mark its PCB so that it gets FEN when we context switch to * it later. */ if (p != curproc) { alpha_pal_wrfen(0); SET_FEN(p); } p->p_md.md_flags |= MDP_FPUSED; } Index: stable/4/sys/i386/i386/machdep.c =================================================================== --- stable/4/sys/i386/i386/machdep.c (revision 118739) +++ stable/4/sys/i386/i386/machdep.c (revision 118740) @@ -1,2612 +1,2613 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD$ */ #include "apm.h" #include "ether.h" #include "npx.h" #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_directio.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_perfmon.h" #include "opt_swap.h" #include "opt_user_ldt.h" #include "opt_userconfig.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #include #endif #ifdef PERFMON #include #endif #include #ifdef OLD_BUS_ARCH #include #endif #include #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm __P((struct save87 *, struct savexmm *)); static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *)); #endif /* CPU_ENABLE_SSE */ #ifdef DIRECTIO extern void ffs_rawread_setup(void); #endif /* DIRECTIO */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "IU", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "IU", ""); static int sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); static int sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS) { int error; /* Unwind the buffer, so that it's linear (possibly starting with * some initial nulls). 
*/ error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr+msgbufp->msg_bufr, msgbufp->msg_size-msgbufp->msg_bufr,req); if(error) return(error); if(msgbufp->msg_bufr>0) { error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr, msgbufp->msg_bufr,req); } return(error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_machdep_msgbuf, "A","Contents of kernel message buffer"); static int msgbuf_clear; static int sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { /* Clear the buffer and reset write pointer */ bzero(msgbufp->msg_ptr,msgbufp->msg_size); msgbufp->msg_bufr=msgbufp->msg_bufx=0; msgbuf_clear=0; } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW, &msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I", "Clear kernel message buffer"); int bootverbose = 0, Maxmem = 0; long dumplo; vm_paddr_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; static struct trapframe proc0_tf; static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. */ printf("%s", version); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %llu (%lluK bytes)\n", ptoa((u_int64_t)Maxmem), ptoa((u_int64_t)Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size1; size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%09llx - 0x%09llx, %llu bytes (%llu pages)\n", (u_int64_t)phys_avail[indx], (u_int64_t)phys_avail[indx + 1] - 1, (u_int64_t)size1, (u_int64_t)size1 / PAGE_SIZE); } } /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. 
When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; int kbytes = physmem * (PAGE_SIZE / 1024); nbuf = 50; if (kbytes > 4096) nbuf += min((kbytes - 4096) / factor, 65536 / factor); if (kbytes > 65536) nbuf += (kbytes - 65536) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } nswbuf = max(min(nbuf/4, 256), 16); #ifdef NSWBUF_MIN if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; #endif #ifdef DIRECTIO ffs_rawread_setup(); #endif valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size); mb_map->system_map = 1; + mbutltop = mbutl; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i]); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %llu (%lluK bytes)\n", ptoa((u_int64_t)cnt.v_free_count), ptoa((u_int64_t)cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 
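The nbuf auto-sizing earlier in cpu_startup() starts from 50 buffers, adds buffers covering roughly 1/4 of the first 64MB of RAM and 1/20 of RAM beyond that, and finally clamps against maxbcache. A stand-alone sketch of the same arithmetic; BKVASIZE is assumed to be 16384 here, the real value comes from the kernel's param header:

    #include <stdio.h>

    #define BKVASIZE  16384          /* assumed; real value is in the kernel headers */
    #define PAGE_SIZE 4096

    /* Reproduce the nbuf heuristic above for a machine with 'pages' of RAM. */
    static int
    autosize_nbuf(int pages, int maxbcache)
    {
        int factor = 4 * BKVASIZE / 1024;     /* the "1/4 x ram" conversion */
        int kbytes = pages * (PAGE_SIZE / 1024);
        int nbuf = 50;

        if (kbytes > 4096)
            nbuf += (kbytes - 4096) / factor < 65536 / factor ?
                (kbytes - 4096) / factor : 65536 / factor;
        if (kbytes > 65536)
            nbuf += (kbytes - 65536) * 2 / (factor * 5);
        if (maxbcache && nbuf > maxbcache / BKVASIZE)
            nbuf = maxbcache / BKVASIZE;
        return (nbuf);
    }

    int
    main(void)
    {
        /* e.g. 128MB of RAM, no maxbcache limit -> 50 + 1024 + 409 buffers */
        printf("nbuf = %d\n", autosize_nbuf(128 * 1024 * 1024 / PAGE_SIZE, 0));
        return (0);
    }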
*/ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ cpu_setregs(); } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } int unregister_netisr(num) int num; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("unregister_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = NULL; return (0); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void osendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) { register struct proc *p = curproc; register struct trapframe *regs; register struct osigframe *fp; struct osigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = (p->p_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0; /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct osigframe)); p->p_sigstk.ss_flags |= SS_ONSTACK; } else fp = (struct osigframe *)regs->tf_esp - 1; /* Translate the signal if appropriate */ if (p->p_sysent->sv_sigtbl) { if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; } /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } /* save scratch registers */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by sigreturn. */ sf.sf_siginfo.si_sc.sc_onstack = oonstack; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. 
*/ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* see sendsig for comment */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct osigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } void sendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct proc *p = curproc; struct trapframe *regs; struct sigacts *psp = p->p_sigacts; struct sigframe sf, *sfp; int oonstack; if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, sig, mask, code); return; } regs = p->p_md.md_regs; oonstack = (p->p_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0; /* save user context */ bzero(&sf, sizeof(struct sigframe)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_mcontext.mc_onstack = oonstack; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(struct trapframe)); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct sigframe)); p->p_sigstk.ss_flags |= SS_ONSTACK; } else sfp = (struct sigframe *)regs->tf_esp - 1; /* Translate the signal is appropriate */ if (p->p_sysent->sv_sigtbl) { if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; } /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* fill siginfo structure */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void*)regs->tf_err; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. 
PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) int osigreturn(p, uap) struct proc *p; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { register struct osigcontext *scp; register struct trapframe *regs = p->p_md.md_regs; int eflags; scp = uap->sigcntxp; if (!useracc((caddr_t)scp, sizeof (struct osigcontext), VM_PROT_READ)) return(EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* go back to user mode if both flags are set */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
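EFL_SECURE() above admits a user-supplied eflags value only when every bit that differs from the current trapframe value lies inside PSL_USERCHANGE, and CS_SECURE() insists the code selector carries user privilege. A small sketch of the eflags check with the mask passed in explicitly; the real PSL_USERCHANGE constant comes from the machine psl header:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Accept 'new_efl' only when all bits that differ from 'cur_efl' fall
     * within 'userchange_mask' (the PSL_USERCHANGE set).  Equivalent to the
     * EFL_SECURE() macro above, with the mask made an argument.
     */
    static bool
    eflags_secure(uint32_t new_efl, uint32_t cur_efl, uint32_t userchange_mask)
    {
        return (((new_efl ^ cur_efl) & ~userchange_mask) == 0);
    }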
*/ if (!CS_SECURE(scp->sc_cs)) { trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* restore scratch registers */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; if (scp->sc_onstack & 01) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; SIGSETOLD(p->p_sigmask, scp->sc_mask); SIG_CANTMASK(p->p_sigmask); regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return(EJUSTRETURN); } int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap; { struct trapframe *regs; ucontext_t *ucp; int cs, eflags; ucp = uap->sigcntxp; if (!useracc((caddr_t)ucp, sizeof(struct osigcontext), VM_PROT_READ)) return (EFAULT); if (((struct osigcontext *)ucp)->sc_trapno == 0x01d516) return (osigreturn(p, (struct osigreturn_args *)uap)); /* * Since ucp is not an osigcontext but a ucontext_t, we have to * check again if all of it is accessible. A ucontext_t is * much larger, so instead of just checking for the pointer * being valid for the size of an osigcontext, now check for * it being valid for a whole, new-style ucontext_t. */ if (!useracc((caddr_t)ucp, sizeof(ucontext_t), VM_PROT_READ)) return (EFAULT); regs = p->p_md.md_regs; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* go back to user mode if both flags are set */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(struct trapframe)); } if (ucp->uc_mcontext.mc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; p->p_sigmask = ucp->uc_sigmask; SIG_CANTMASK(p->p_sigmask); return(EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. This is disabled by default for * the SMP case as there is a small window of opportunity whereby a ready * process is delayed to the next clock tick. It should be safe to enable * for SMP if power is a concern. * * On -stable, cpu_idle() is called with interrupts disabled and must * return with them enabled. */ #ifdef SMP static int cpu_idle_hlt = 0; #else static int cpu_idle_hlt = 1; #endif SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); void cpu_idle(void) { if (cpu_idle_hlt) { /* * We must guarentee that hlt is exactly the instruction * following the sti. */ __asm __volatile("sti; hlt"); } else { __asm __volatile("sti"); } } /* * Clear registers on exec */ void setregs(p, entry, stack, ps_strings) struct proc *p; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = p->p_md.md_regs; struct pcb *pcb = &p->p_addr->u_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); #ifdef USER_LDT /* was i386_user_cleanup() in NetBSD */ user_ldt_free(pcb); #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #if NNPX > 0 /* Initialize the npx (if any) for the current process. 
*/ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ p->p_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); cr0 |= CR0_NE; /* Done by npxinit() */ cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ #ifdef I386_CPU if (cpu_class != CPUCLASS_386) #endif cr0 |= CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); extern u_long bootdev; /* not a dev_t - encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Boot device (not in dev_t format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif #ifndef SMP extern struct segment_descriptor common_tssd, *tss_gdt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor 
present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 9 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - 
all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } #define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. 
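Both physmap[] and phys_avail[] described above are flat arrays of base/bound pairs, with a zero bound terminating the list (phys_avail keeps two slots free so a terminating 0/0 pair always fits). A minimal sketch of walking such an array to total the usable pages; the names and sample chunks are hypothetical:

    #include <stdio.h>

    #define PAGE_SIZE 4096

    /* Sum the pages described by base/bound pairs, stopping at a zero bound. */
    static unsigned long
    count_pages(const unsigned long *chunks)
    {
        unsigned long pages = 0;
        int i;

        for (i = 0; chunks[i + 1] != 0; i += 2)
            pages += (chunks[i + 1] - chunks[i]) / PAGE_SIZE;
        return (pages);
    }

    int
    main(void)
    {
        /* hypothetical layout: base memory below 640K, 127MB above 1MB */
        unsigned long chunks[] = {
            0x1000,   0x9f000,       /* base memory, page 0 masked off */
            0x100000, 0x8000000,     /* extended memory */
            0, 0                     /* terminator */
        };

        printf("%lu pages available\n", count_pages(chunks));
        return (0);
    }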
*/ static void getmemsize(int first) { int i, physmap_idx, pa_indx; int hasbrokenint12; u_int basemem, extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t *pte; const char *cp; struct { u_int64_t base; u_int64_t length; u_int32_t type; } *smap; hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); bzero(&vmf, sizeof(struct vm86frame)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Some newer BIOSes has broken INT 12H implementation which cause * kernel panic immediately. In this case, we need to scan SMAP * with INT 15:E820 first, then determine base memory size. */ if (hasbrokenint12) { goto int15e820; } /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; int15e820: /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. 
*/ pte = vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ #define SMAPSIZ sizeof(*smap) #define SMAP_SIG 0x534D4150 /* 'SMAP' */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = SMAPSIZ; i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; #ifndef PAE if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); goto next_run; } #endif for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); /* * Perform "base memory" related probes & setup based on SMAP */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0) { basemem = 640; } if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } if (physmap[1] != 0) goto physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. 
*/ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif /* * hw.maxmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. */ if ((cp = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", cp); else Maxmem = atop(AllowMem); } if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %uK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; pte = vtopte(KERNBASE + PAGE_SIZE); /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; volatile int *ptr = (int *)(KERNBASE + PAGE_SIZE); /* * block out kernel memory as not available. */ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *ptr; /* * Test for alternating 1's and 0's */ *ptr = 0xaaaaaaaa; if (*ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *ptr = 0x55555555; if (*ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *ptr = 0xffffffff; if (*ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *ptr = 0x0; if (*ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
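The page probe above writes alternating and solid bit patterns (0xaaaaaaaa, 0x55555555, all ones, all zeroes) through a scratch non-cacheable mapping and rejects any page that fails to read them back. The same idea against an ordinary word, as a stand-alone sketch:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Patterns used by the probe above: alternating bits, all 1s, all 0s. */
    static const uint32_t test_patterns[] = {
        0xaaaaaaaa, 0x55555555, 0xffffffff, 0x00000000
    };

    /*
     * Write each pattern to *word and verify it reads back; restore the
     * original contents afterwards.  Returns false if any pattern fails,
     * the same criterion the loop above uses to mark a page bad.
     */
    static bool
    word_tests_good(volatile uint32_t *word)
    {
        uint32_t saved = *word;
        bool good = true;
        size_t i;

        for (i = 0; i < sizeof(test_patterns) / sizeof(test_patterns[0]); i++) {
            *word = test_patterns[i];
            if (*word != test_patterns[i])
                good = false;
        }
        *word = saved;
        return (good);
    }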
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, off, x; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param1(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = atop(sizeof(struct privatespace) - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0]; #else gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPROC0_SEL].ssd_base = (int) &common_tss; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* make ldt memory segments */ /* * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... 
*/ ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); #ifdef USER_LDT currentldt = _default_ldt; #endif /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! 
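setidt() above and the syscall call gate set up just below both store a handler's address split across two 16-bit gate fields, gd_looffset and gd_hioffset. A small sketch of splitting and reassembling a 32-bit entry point that way; the struct here is a hypothetical stand-in, not the real gate_descriptor layout:

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the two offset fields of a gate descriptor. */
    struct gate_offsets {
        uint16_t lo;    /* gd_looffset: bits 0..15 of the handler */
        uint16_t hi;    /* gd_hioffset: bits 16..31 of the handler */
    };

    static struct gate_offsets
    split_handler(uint32_t func)
    {
        struct gate_offsets g = { (uint16_t)func, (uint16_t)(func >> 16) };
        return (g);
    }

    static uint32_t
    join_handler(struct gate_offsets g)
    {
        return (((uint32_t)g.hi << 16) | g.lo);
    }

    int
    main(void)
    {
        uint32_t addr = 0xc02461f0;     /* hypothetical kernel entry point */

        assert(join_handler(split_handler(addr)) == addr);
        return (0);
    }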
*/ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; common_tssd = *tss_gdt; common_tss.tss_ioopt = (sizeof common_tss) << 16; ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); #ifdef PAE dblfault_tss.tss_cr3 = (int)IdlePDPT - KERNBASE; #else dblfault_tss.tss_cr3 = (int)IdlePTD; #endif dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); pmap_bootstrap(first, 0); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; #ifdef PAE proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePDPT - KERNBASE; #else proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; #endif #ifdef SMP proc0.p_addr->u_pcb.pcb_mpnest = 1; #endif proc0.p_addr->u_pcb.pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_md.md_regs->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_read_u_check(p, addr, len) struct proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) 
(addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_regs - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if ((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; long data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFL_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_save); if (off >= min && off <= min + sizeof(union savefpu) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 *sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; 
++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &p->p_addr->u_pcb.pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs); return (0); } int fill_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; if (p == NULL) { dbregs->dr0 = rdr0(); dbregs->dr1 = rdr1(); dbregs->dr2 = rdr2(); dbregs->dr3 = rdr3(); dbregs->dr4 = rdr4(); dbregs->dr5 = rdr5(); dbregs->dr6 = rdr6(); dbregs->dr7 = rdr7(); } else { pcb = &p->p_addr->u_pcb; dbregs->dr0 = pcb->pcb_dr0; dbregs->dr1 = pcb->pcb_dr1; dbregs->dr2 = pcb->pcb_dr2; dbregs->dr3 = pcb->pcb_dr3; dbregs->dr4 = 0; dbregs->dr5 = 0; dbregs->dr6 = pcb->pcb_dr6; dbregs->dr7 = pcb->pcb_dr7; } return (0); } int set_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; int i; u_int32_t mask1, mask2; if (p == NULL) { load_dr0(dbregs->dr0); load_dr1(dbregs->dr1); load_dr2(dbregs->dr2); load_dr3(dbregs->dr3); load_dr4(dbregs->dr4); load_dr5(dbregs->dr5); load_dr6(dbregs->dr6); load_dr7(dbregs->dr7); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; i++, mask1 <<= 2, mask2 <<= 2) if ((dbregs->dr7 & mask1) == mask2) return (EINVAL); pcb = &p->p_addr->u_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space, unless, perhaps, we were called by * uid 0. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (suser(p) != 0) { if (dbregs->dr7 & 0x3) { /* dr0 is enabled */ if (dbregs->dr0 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<2)) { /* dr1 is enabled */ if (dbregs->dr1 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<4)) { /* dr2 is enabled */ if (dbregs->dr2 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<6)) { /* dr3 is enabled */ if (dbregs->dr3 >= VM_MAXUSER_ADDRESS) return (EINVAL); } } pcb->pcb_dr0 = dbregs->dr0; pcb->pcb_dr1 = dbregs->dr1; pcb->pcb_dr2 = dbregs->dr2; pcb->pcb_dr3 = dbregs->dr3; pcb->pcb_dr6 = dbregs->dr6; pcb->pcb_dr7 = dbregs->dr7; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
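set_dbregs() above rejects a dr7 value if any of the eight two-bit control fields in dr7[31:16] holds the undefined pattern 10, and it treats the two low bits per breakpoint in dr7[7:0] as that breakpoint's enable bits when validating the breakpoint addresses. A small sketch of those two decodes, names hypothetical:

    #include <stdbool.h>
    #include <stdint.h>

    /* Is breakpoint 'n' (0..3) enabled?  Each gets two enable bits in dr7[7:0]. */
    static bool
    dr7_bp_enabled(uint32_t dr7, int n)
    {
        return (((dr7 >> (n * 2)) & 0x3) != 0);
    }

    /*
     * Reject a dr7 value if any of the eight two-bit control fields in
     * dr7[31:16] holds the pattern 10 -- the same undefined-setting test
     * the loop above performs before accepting user-supplied registers.
     */
    static bool
    dr7_value_ok(uint32_t dr7)
    {
        int i;

        for (i = 0; i < 8; i++)
            if (((dr7 >> (16 + i * 2)) & 0x3) == 0x2)
                return (false);
        return (true);
    }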
*/ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i=0; i /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. 
*/ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: stable/4/sys/kern/uipc_mbuf.c =================================================================== --- stable/4/sys/kern/uipc_mbuf.c (revision 118739) +++ stable/4/sys/kern/uipc_mbuf.c (revision 118740) @@ -1,1631 +1,1635 @@ /* * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 * $FreeBSD$ */ #include "opt_param.h" #include "opt_mbuf_stress_test.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef INVARIANTS #include #endif static void mbinit __P((void *)); SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL) struct mbuf *mbutl; +struct mbuf *mbutltop; char *mclrefcnt; struct mbstat mbstat; u_long mbtypes[MT_NTYPES]; struct mbuf *mmbfree; union mcluster *mclfree; int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; #ifdef MBUF_STRESS_TEST int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; int m_defragrandomfailures; #endif int m_clreflimithits; int nmbclusters; int nmbufs; u_int m_mballoc_wid = 0; u_int m_clalloc_wid = 0; SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, &max_linkhdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, &max_protohdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, &max_datalen, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0, ""); SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, ""); SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes, sizeof(mbtypes), "LU", ""); SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "Maximum number of mbuf clusters available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, "Maximum number of mbufs available"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif SYSCTL_INT(_kern_ipc, OID_AUTO, m_clreflimithits, CTLFLAG_RD, &m_clreflimithits, 0, ""); static void m_reclaim __P((void)); static struct mbuf *m_clreflimit(struct mbuf *m0, int how); #ifndef NMBCLUSTERS #define NMBCLUSTERS (512 + maxusers * 16) #endif #ifndef NMBUFS #define NMBUFS (nmbclusters * 4) #endif /* * Perform sanity checks of tunables declared above. */ static void tunable_mbinit(void *dummy) { /* * This has to be done before VM init. */ nmbclusters = NMBCLUSTERS; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); nmbufs = NMBUFS; TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); /* Sanity checks */ if (nmbufs < nmbclusters * 2) nmbufs = nmbclusters * 2; return; } SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); /* "number of clusters of pages" */ #define NCL_INIT 1 #define NMB_INIT 16 /* ARGSUSED*/ static void mbinit(dummy) void *dummy; { int s; mmbfree = NULL; mclfree = NULL; mbstat.m_msize = MSIZE; mbstat.m_mclbytes = MCLBYTES; mbstat.m_minclsize = MINCLSIZE; mbstat.m_mlen = MLEN; mbstat.m_mhlen = MHLEN; s = splimp(); if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0) goto bad; #if MCLBYTES <= PAGE_SIZE if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0) goto bad; #else /* It's OK to call contigmalloc in this context. */ if (m_clalloc(16, M_WAIT) == 0) goto bad; #endif splx(s); return; bad: panic("mbinit"); } /* * Allocate at least nmb mbufs and place on mbuf free list. 
* Must be called at splimp. */ /* ARGSUSED */ int m_mballoc(nmb, how) register int nmb; int how; { register caddr_t p; register int i; int nbytes; /* * If we've hit the mbuf limit, stop allocating from mb_map, * (or trying to) in order to avoid dipping into the section of * mb_map which we've "reserved" for clusters. */ if ((nmb + mbstat.m_mbufs) > nmbufs) return (0); /* * Once we run out of map space, it will be impossible to get * any more (nothing is ever freed back to the map) * -- however you are not dead as m_reclaim might * still be able to free a substantial amount of space. * * XXX Furthermore, we can also work with "recycled" mbufs (when * we're calling with M_WAIT the sleep procedure will be woken * up when an mbuf is freed. See m_mballoc_wait()). */ if (mb_map_full) return (0); nbytes = round_page(nmb * MSIZE); p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT); if (p == 0 && how == M_WAIT) { mbstat.m_wait++; p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK); } /* * Either the map is now full, or `how' is M_NOWAIT and there * are no pages left. */ if (p == NULL) return (0); + mbutltop += nbytes; nmb = nbytes / MSIZE; for (i = 0; i < nmb; i++) { ((struct mbuf *)p)->m_next = mmbfree; mmbfree = (struct mbuf *)p; p += MSIZE; } mbstat.m_mbufs += nmb; mbtypes[MT_FREE] += nmb; return (1); } /* * Once the mb_map has been exhausted and if the call to the allocation macros * (or, in some cases, functions) is with M_WAIT, then it is necessary to rely * solely on reclaimed mbufs. Here we wait for an mbuf to be freed for a * designated (mbuf_wait) time. */ struct mbuf * m_mballoc_wait(int caller, int type) { struct mbuf *p; int s; s = splimp(); m_mballoc_wid++; if ((tsleep(&m_mballoc_wid, PVM, "mballc", mbuf_wait)) == EWOULDBLOCK) m_mballoc_wid--; splx(s); /* * Now that we (think) that we've got something, we will redo an * MGET, but avoid getting into another instance of m_mballoc_wait() * XXX: We retry to fetch _even_ if the sleep timed out. This is left * this way, purposely, in the [unlikely] case that an mbuf was * freed but the sleep was not awakened in time. */ p = NULL; switch (caller) { case MGET_C: MGET(p, M_DONTWAIT, type); break; case MGETHDR_C: MGETHDR(p, M_DONTWAIT, type); break; default: panic("m_mballoc_wait: invalid caller (%d)", caller); } s = splimp(); if (p != NULL) { /* We waited and got something... */ mbstat.m_wait++; /* Wake up another if we have more free. */ if (mmbfree != NULL) MMBWAKEUP(); } splx(s); return (p); } #if MCLBYTES > PAGE_SIZE static int i_want_my_mcl; static void kproc_mclalloc(void) { int status; while (1) { tsleep(&i_want_my_mcl, PVM, "mclalloc", 0); for (; i_want_my_mcl; i_want_my_mcl--) { if (m_clalloc(1, M_WAIT) == 0) printf("m_clalloc failed even in process context!\n"); } } } static struct proc *mclallocproc; static struct kproc_desc mclalloc_kp = { "mclalloc", kproc_mclalloc, &mclallocproc }; SYSINIT(mclallocproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &mclalloc_kp); #endif /* * Allocate some number of mbuf clusters * and place on cluster free list. * Must be called at splimp. */ /* ARGSUSED */ int m_clalloc(ncl, how) register int ncl; int how; { register caddr_t p; register int i; int npg; /* * If we've hit the mcluster number limit, stop allocating from * mb_map, (or trying to) in order to avoid dipping into the section * of mb_map which we've "reserved" for mbufs. 
*/ if ((ncl + mbstat.m_clusters) > nmbclusters) goto m_clalloc_fail; /* * Once we run out of map space, it will be impossible * to get any more (nothing is ever freed back to the * map). From this point on, we solely rely on freed * mclusters. */ if (mb_map_full) goto m_clalloc_fail; #if MCLBYTES > PAGE_SIZE if (how != M_WAIT) { i_want_my_mcl += ncl; wakeup(&i_want_my_mcl); mbstat.m_wait++; p = 0; } else { p = contigmalloc1(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul, ~0ul, PAGE_SIZE, 0, mb_map); } #else npg = ncl; p = (caddr_t)kmem_malloc(mb_map, ctob(npg), how != M_WAIT ? M_NOWAIT : M_WAITOK); ncl = ncl * PAGE_SIZE / MCLBYTES; #endif /* * Either the map is now full, or `how' is M_NOWAIT and there * are no pages left. */ if (p == NULL) { static int last_report ; /* when we did that (in ticks) */ m_clalloc_fail: mbstat.m_drops++; if (ticks < last_report || (ticks - last_report) >= hz) { last_report = ticks; printf("All mbuf clusters exhausted, please see tuning(7).\n"); } return (0); } + + mbutltop += ctob(npg); for (i = 0; i < ncl; i++) { ((union mcluster *)p)->mcl_next = mclfree; mclfree = (union mcluster *)p; p += MCLBYTES; mbstat.m_clfree++; } mbstat.m_clusters += ncl; return (1); } /* * Once the mb_map submap has been exhausted and the allocation is called with * M_WAIT, we rely on the mclfree union pointers. If nothing is free, we will * sleep for a designated amount of time (mbuf_wait) or until we're woken up * due to sudden mcluster availability. */ caddr_t m_clalloc_wait(void) { caddr_t p; int s; #ifdef __i386__ /* If in interrupt context, and INVARIANTS, maintain sanity and die. */ KASSERT(intr_nesting_level == 0, ("CLALLOC: CANNOT WAIT IN INTERRUPT")); #endif /* Sleep until something's available or until we expire. */ m_clalloc_wid++; if ((tsleep(&m_clalloc_wid, PVM, "mclalc", mbuf_wait)) == EWOULDBLOCK) m_clalloc_wid--; /* * Now that we (think) that we've got something, we will redo and * MGET, but avoid getting into another instance of m_clalloc_wait() */ p = NULL; MCLALLOC(p, M_DONTWAIT); s = splimp(); if (p != NULL) { /* We waited and got something... */ mbstat.m_wait++; /* Wake up another if we have more free. */ if (mclfree != NULL) MCLWAKEUP(); } splx(s); return (p); } /* * When MGET fails, ask protocols to free space when short of memory, * then re-attempt to allocate an mbuf. */ struct mbuf * m_retry(i, t) int i, t; { register struct mbuf *m; /* * Must only do the reclaim if not in an interrupt context. */ if (i == M_WAIT) { #ifdef __i386__ KASSERT(intr_nesting_level == 0, ("MBALLOC: CANNOT WAIT IN INTERRUPT")); #endif m_reclaim(); } /* * Both m_mballoc_wait and m_retry must be nulled because * when the MGET macro is run from here, we deffinately do _not_ * want to enter an instance of m_mballoc_wait() or m_retry() (again!) */ #define m_mballoc_wait(caller,type) (struct mbuf *)0 #define m_retry(i, t) (struct mbuf *)0 MGET(m, i, t); #undef m_retry #undef m_mballoc_wait if (m != NULL) mbstat.m_wait++; else { static int last_report ; /* when we did that (in ticks) */ mbstat.m_drops++; if (ticks < last_report || (ticks - last_report) >= hz) { last_report = ticks; printf("All mbufs exhausted, please see tuning(7).\n"); } } return (m); } /* * As above; retry an MGETHDR. */ struct mbuf * m_retryhdr(i, t) int i, t; { register struct mbuf *m; /* * Must only do the reclaim if not in an interrupt context. 
*/ if (i == M_WAIT) { #ifdef __i386__ KASSERT(intr_nesting_level == 0, ("MBALLOC: CANNOT WAIT IN INTERRUPT")); #endif m_reclaim(); } #define m_mballoc_wait(caller,type) (struct mbuf *)0 #define m_retryhdr(i, t) (struct mbuf *)0 MGETHDR(m, i, t); #undef m_retryhdr #undef m_mballoc_wait if (m != NULL) mbstat.m_wait++; else { static int last_report ; /* when we did that (in ticks) */ mbstat.m_drops++; if (ticks < last_report || (ticks - last_report) >= hz) { last_report = ticks; printf("All mbufs exhausted, please see tuning(7).\n"); } } return (m); } static void m_reclaim() { register struct domain *dp; register struct protosw *pr; int s = splimp(); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain) (*pr->pr_drain)(); splx(s); mbstat.m_drain++; } /* * Space allocation routines. * These are also available as macros * for critical paths. */ struct mbuf * m_get(how, type) int how, type; { register struct mbuf *m; MGET(m, how, type); return (m); } struct mbuf * m_gethdr(how, type) int how, type; { register struct mbuf *m; MGETHDR(m, how, type); return (m); } struct mbuf * m_getclr(how, type) int how, type; { register struct mbuf *m; MGET(m, how, type); if (m == 0) return (0); bzero(mtod(m, caddr_t), MLEN); return (m); } /* * m_getcl() returns an mbuf with an attached cluster. * Because many network drivers use this kind of buffers a lot, it is * convenient to keep a small pool of free buffers of this kind. * Even a small size such as 10 gives about 10% improvement in the * forwarding rate in a bridge or router. * The size of this free list is controlled by the sysctl variable * mcl_pool_max. The list is populated on m_freem(), and used in * m_getcl() if elements are available. */ static struct mbuf *mcl_pool; static int mcl_pool_now; static int mcl_pool_max = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_max, CTLFLAG_RW, &mcl_pool_max, 0, "Maximum number of mbufs+cluster in free list"); SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_now, CTLFLAG_RD, &mcl_pool_now, 0, "Current number of mbufs+cluster in free list"); struct mbuf * m_getcl(int how, short type, int flags) { int s = splimp(); struct mbuf *mp; if (flags & M_PKTHDR) { if (type == MT_DATA && mcl_pool) { mp = mcl_pool; mcl_pool = mp->m_nextpkt; mcl_pool_now--; splx(s); mp->m_nextpkt = NULL; mp->m_data = mp->m_ext.ext_buf; mp->m_flags = M_PKTHDR|M_EXT; mp->m_pkthdr.rcvif = NULL; mp->m_pkthdr.csum_flags = 0; return mp; } else MGETHDR(mp, how, type); } else MGET(mp, how, type); if (mp) { MCLGET(mp, how); if ( (mp->m_flags & M_EXT) == 0) { m_free(mp); mp = NULL; } } splx(s); return mp; } /* * struct mbuf * * m_getm(m, len, how, type) * * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If m is * non-null, then we assume that it is a single mbuf or an mbuf chain to * which we want len bytes worth of mbufs and/or clusters attached, and so * if we succeed in allocating it, we will just return a pointer to m. * * If we happen to fail at any point during the allocation, we will free * up everything we have already allocated and return NULL. 
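What m_mballoc() and m_clalloc() above do with the memory returned by kmem_malloc()/contigmalloc1() is to carve it into fixed-size pieces and thread them onto a singly linked free list; the new mbutltop lines additionally record the highest address handed out, so the mcl_valid() check added in mbuf.h can later tell a genuine buffer address from a junk pointer. A minimal user-space sketch of that pattern (buf_alloc, arena_base and arena_top are stand-in names; the real allocator carves the contiguous mb_map submap, approximated here with a single malloc()):

#include <stdio.h>
#include <stdlib.h>

#define BUFSIZE	256			/* stands in for MSIZE */

struct buf {				/* stands in for struct mbuf */
	struct buf *b_next;
};

static struct buf *freelist;		/* stands in for mmbfree */
static char *arena_base, *arena_top;	/* stand in for mbutl / mbutltop */

/* Carve nbytes of fresh memory into buffers and push them on the free list. */
static int
buf_alloc(size_t nbytes)
{
	char *p;
	size_t i;

	if ((p = malloc(nbytes)) == NULL)
		return (0);
	if (arena_base == NULL)
		arena_base = p;
	arena_top = p + nbytes;		/* the "mbutltop += nbytes" step */
	for (i = 0; i + BUFSIZE <= nbytes; i += BUFSIZE) {
		((struct buf *)(p + i))->b_next = freelist;
		freelist = (struct buf *)(p + i);
	}
	return (1);
}

int
main(void)
{
	struct buf *b;
	int n = 0;

	buf_alloc(4096);
	for (b = freelist; b != NULL; b = b->b_next)
		n++;
	printf("%d buffers carved, arena %p..%p\n", n,
	    (void *)arena_base, (void *)arena_top);
	return (0);
}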
* */ struct mbuf * m_getm(struct mbuf *m, int len, int how, int type) { struct mbuf *top, *tail, *mp, *mtail = NULL; KASSERT(len >= 0, ("len is < 0 in m_getm")); MGET(mp, how, type); if (mp == NULL) return (NULL); else if (len > MINCLSIZE) { MCLGET(mp, how); if ((mp->m_flags & M_EXT) == 0) { m_free(mp); return (NULL); } } mp->m_len = 0; len -= M_TRAILINGSPACE(mp); if (m != NULL) for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next); else m = mp; top = tail = mp; while (len > 0) { MGET(mp, how, type); if (mp == NULL) goto failed; tail->m_next = mp; tail = mp; if (len > MINCLSIZE) { MCLGET(mp, how); if ((mp->m_flags & M_EXT) == 0) goto failed; } mp->m_len = 0; len -= M_TRAILINGSPACE(mp); } if (mtail != NULL) mtail->m_next = top; return (m); failed: m_freem(top); return (NULL); } /* * MFREE(struct mbuf *m, struct mbuf *n) * Free a single mbuf and associated external storage. * Place the successor, if any, in n. * * we do need to check non-first mbuf for m_aux, since some of existing * code does not call M_PREPEND properly. * (example: call to bpf_mtap from drivers) */ #define MFREE(m, n) MBUFLOCK( \ struct mbuf *_mm = (m); \ \ KASSERT(_mm->m_type != MT_FREE, ("freeing free mbuf")); \ mbtypes[_mm->m_type]--; \ if ((_mm->m_flags & M_PKTHDR) != 0) \ m_tag_delete_chain(_mm, NULL); \ if (_mm->m_flags & M_EXT) \ MEXTFREE1(m); \ (n) = _mm->m_next; \ _mm->m_type = MT_FREE; \ mbtypes[MT_FREE]++; \ _mm->m_next = mmbfree; \ mmbfree = _mm; \ MMBWAKEUP(); \ ) struct mbuf * m_free(m) struct mbuf *m; { register struct mbuf *n; MFREE(m, n); return (n); } void m_freem(m) struct mbuf *m; { int s = splimp(); /* * Try to keep a small pool of mbuf+cluster for quick use in * device drivers. A good candidate is a M_PKTHDR buffer with * only one cluster attached. Other mbufs, or those exceeding * the pool size, are just m_free'd in the usual way. * The following code makes sure that m_next, m_type, * m_pkthdr.aux and m_ext.* are properly initialized. * Other fields in the mbuf are initialized in m_getcl() * upon allocation. */ if (mcl_pool_now < mcl_pool_max && m && m->m_next == NULL && (m->m_flags & (M_PKTHDR|M_EXT)) == (M_PKTHDR|M_EXT) && m->m_type == MT_DATA && M_EXT_WRITABLE(m) ) { m_tag_delete_chain(m, NULL); m->m_nextpkt = mcl_pool; mcl_pool = m; mcl_pool_now++; } else { while (m) m = m_free(m); } splx(s); } /* * Mbuffer utility routines. */ /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(m, len, how) register struct mbuf *m; int len, how; { struct mbuf *mn; if (m->m_flags & M_PKTHDR) MGETHDR(mn, how, m->m_type); else MGET(mn, how, m->m_type); if (mn == (struct mbuf *)NULL) { m_freem(m); return ((struct mbuf *)NULL); } if (m->m_flags & M_PKTHDR) M_MOVE_PKTHDR(mn, m); mn->m_next = m; m = mn; if (len < MHLEN) MH_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. 
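The mcl_pool kept by m_getcl() and refilled by m_freem() above is a bounded LIFO cache of ready-to-use mbuf+cluster pairs, threaded through m_nextpkt and capped by mcl_pool_max (which defaults to 0, so the cache is off unless the sysctl is raised). The same push-on-free / pop-on-allocate pattern in free-standing form (obj_get, obj_free and the cap of 8 are illustrative stand-ins):

#include <stdio.h>
#include <stdlib.h>

struct obj {
	struct obj *next;
};

static struct obj *pool;		/* stands in for mcl_pool */
static int pool_now, pool_max = 8;	/* stand in for mcl_pool_now / _max */

static struct obj *
obj_get(void)
{
	struct obj *o;

	if (pool != NULL) {		/* fast path: reuse a cached object */
		o = pool;
		pool = o->next;
		pool_now--;
		return (o);
	}
	return (malloc(sizeof(*o)));	/* slow path: real allocation */
}

static void
obj_free(struct obj *o)
{
	if (pool_now < pool_max) {	/* cache it for the next obj_get() */
		o->next = pool;
		pool = o;
		pool_now++;
	} else
		free(o);
}

int
main(void)
{
	struct obj *a = obj_get();

	obj_free(a);
	printf("cached: %d\n", pool_now);	/* prints 1 */
	return (0);
}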
*/ #define MCFail (mbstat.m_mcfail) struct mbuf * m_copym(m, off0, len, wait) register struct mbuf *m; int off0, wait; register int len; { register struct mbuf *n, **np; register int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = 0; while (len > 0) { if (m == 0) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } if (copyhdr) MGETHDR(n, wait, m->m_type); else MGET(n, wait, m->m_type); *np = n; if (n == 0) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; if (m->m_ext.ext_ref == NULL) { atomic_add_char( &mclrefcnt[mtocl(m->m_ext.ext_buf)], 1); } else { int s = splimp(); (*m->m_ext.ext_ref)(m->m_ext.ext_buf, m->m_ext.ext_size); splx(s); } n->m_ext = m->m_ext; n->m_flags |= M_EXT; } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (unsigned)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } top = m_clreflimit(top, wait); if (top == 0) MCFail++; return (top); nospace: m_freem(top); MCFail++; return (0); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies also have the room available. */ struct mbuf * m_copypacket(m, how) struct mbuf *m; int how; { struct mbuf *top, *n, *o; MGET(n, how, m->m_type); top = n; if (!n) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; if (m->m_ext.ext_ref == NULL) atomic_add_char(&mclrefcnt[mtocl(m->m_ext.ext_buf)], 1); else { int s = splimp(); (*m->m_ext.ext_ref)(m->m_ext.ext_buf, m->m_ext.ext_size); splx(s); } n->m_ext = m->m_ext; n->m_flags |= M_EXT; } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { MGET(o, how, m->m_type); if (!o) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; if (m->m_ext.ext_ref == NULL) { atomic_add_char( &mclrefcnt[mtocl(m->m_ext.ext_buf)], 1); } else { int s = splimp(); (*m->m_ext.ext_ref)(m->m_ext.ext_buf, m->m_ext.ext_size); splx(s); } n->m_ext = m->m_ext; n->m_flags |= M_EXT; } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } top = m_clreflimit(top, how); return top; nospace: m_freem(top); MCFail++; return 0; } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. 
*/ void m_copydata(m, off, len, cp) register struct mbuf *m; register int off; register int len; caddr_t cp; { register unsigned count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(m, how) struct mbuf *m; int how; { struct mbuf **p, *top = NULL; int remain, moff, nsize; /* Sanity check */ if (m == NULL) return (0); KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __FUNCTION__)); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ MGET(n, how, m->m_type); if (n == NULL) goto nospace; if (top == NULL) { /* first one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) goto nospace; nsize = MHLEN; } else /* not the first one */ nsize = MLEN; if (remain >= MINCLSIZE) { MCLGET(n, how); if ((n->m_flags & M_EXT) == 0) { (void)m_free(n); goto nospace; } nsize = MCLBYTES; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __FUNCTION__)); } return (top); nospace: m_freem(top); MCFail++; return (0); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. */ void m_cat(m, n) register struct mbuf *m, *n; { while (m->m_next) m = m->m_next; while (n) { if (m->m_flags & M_EXT || m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } void m_adj(mp, req_len) struct mbuf *mp; int req_len; { register int len = req_len; register struct mbuf *m; register int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } m = mp; if (mp->m_flags & M_PKTHDR) m->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. 
*/ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; break; } count -= m->m_len; } while (m->m_next) (m = m->m_next) ->m_len = 0; } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod and dtom * will work for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ #define MPFail (mbstat.m_mpfail) struct mbuf * m_pullup(n, len) register struct mbuf *n; int len; { register struct mbuf *m; register int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; MGET(m, M_DONTWAIT, n->m_type); if (m == 0) goto bad; m->m_len = 0; if (n->m_flags & M_PKTHDR) M_MOVE_PKTHDR(m, n); } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); MPFail++; return (0); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. 
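The head trim in m_adj() above walks the chain, emptying whole mbufs until fewer than req_len bytes remain to be dropped, and then advances m_data inside the mbuf where the cut lands. A stand-alone sketch of that loop over a simplified segment type (seg and seg_trim_head are illustrative names):

#include <stdio.h>

struct seg {
	struct seg *next;
	char *data;		/* stands in for m_data */
	int len;		/* stands in for m_len */
};

/* Drop the first "len" bytes from a chain of segments, as m_adj() does. */
static void
seg_trim_head(struct seg *s, int len)
{
	while (s != NULL && len > 0) {
		if (s->len <= len) {
			len -= s->len;
			s->len = 0;
			s = s->next;
		} else {
			s->len -= len;
			s->data += len;
			len = 0;
		}
	}
}

int
main(void)
{
	char a[] = "hello", b[] = "world";
	struct seg s2 = { NULL, b, 5 };
	struct seg s1 = { &s2, a, 5 };

	seg_trim_head(&s1, 7);
	printf("%.*s%.*s\n", s1.len, s1.data, s2.len, s2.data);  /* prints "rld" */
	return (0);
}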
*/ struct mbuf * m_split(m0, len0, wait) register struct mbuf *m0; int len0, wait; { register struct mbuf *m, *n; unsigned len = len0, remain; for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == 0) return (0); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR) { MGETHDR(n, wait, m0->m_type); if (n == 0) return (0); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ MH_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == 0) { (void) m_free(n); return (0); } else { n->m_len = 0; return (n); } } else MH_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = 0; return (n); } else { MGET(n, wait, m->m_type); if (n == 0) return (0); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_flags |= M_EXT; n->m_ext = m->m_ext; if (m->m_ext.ext_ref == NULL) atomic_add_char(&mclrefcnt[mtocl(m->m_ext.ext_buf)], 1); else { int s = splimp(); (*m->m_ext.ext_ref)(m->m_ext.ext_buf, m->m_ext.ext_size); splx(s); } n->m_data = m->m_data + len; } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = 0; n = m_clreflimit(n, wait); return (n); } /* * Routine to copy from device local memory into mbufs. */ struct mbuf * m_devget(buf, totlen, off0, ifp, copy) char *buf; int totlen, off0; struct ifnet *ifp; void (*copy) __P((char *from, caddr_t to, u_int len)); { register struct mbuf *m; struct mbuf *top = 0, **mp = ⊤ register int off = off0, len; register char *cp; char *epkt; cp = buf; epkt = cp + totlen; if (off) { cp += off + 2 * sizeof(u_short); totlen -= 2 * sizeof(u_short); } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == 0) return (0); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; m->m_len = MHLEN; while (totlen > 0) { if (top) { MGET(m, M_DONTWAIT, MT_DATA); if (m == 0) { m_freem(top); return (0); } m->m_len = MLEN; } len = min(totlen, epkt - cp); if (len >= MINCLSIZE) { MCLGET(m, M_DONTWAIT); if (m->m_flags & M_EXT) m->m_len = len = min(len, MCLBYTES); else len = m->m_len; } else { /* * Place initial small packet/header at end of mbuf. */ if (len < m->m_len) { if (top == 0 && len + max_linkhdr <= m->m_len) m->m_data += max_linkhdr; m->m_len = len; } else len = m->m_len; } if (copy) copy(cp, mtod(m, caddr_t), (unsigned)len); else bcopy(cp, mtod(m, caddr_t), (unsigned)len); cp += len; *mp = m; mp = &m->m_next; totlen -= len; if (cp == epkt) cp = buf; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. 
*/ void m_copyback(m0, off, len, cp) struct mbuf *m0; register int off; register int len; caddr_t cp; { register int mlen; register struct mbuf *m = m0, *n; int totlen = 0; if (m0 == 0) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == 0) { n = m_getclr(M_DONTWAIT, m->m_type); if (n == 0) goto out; n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == 0) { n = m_get(M_DONTWAIT, m->m_type); if (n == 0) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } void m_print(const struct mbuf *m) { int len; const struct mbuf *m2; len = m->m_pkthdr.len; m2 = m; while (len) { printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-"); len -= m2->m_len; m2 = m2->m_next; } return; } /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster")); to->m_flags = from->m_flags & M_COPYFLAGS; to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) { to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } /* * Defragment a mbuf chain, returning the shortest possible * chain of mbufs and clusters. If allocation fails and * this cannot be completed, NULL will be returned, but * the passed in chain will be unchanged. Upon success, * the original chain will be freed, and the new chain * will be returned. * * If a non-packet header is passed in, the original * mbuf (chain?) will be returned unharmed. 
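m_length() and m_fixhdr() above are a plain walk over m_next, summing m_len and optionally remembering the last mbuf so callers can append without rescanning. The same walk over a simplified segment type (seg_length is an illustrative name):

#include <stdio.h>

struct seg {
	struct seg *next;
	int len;
};

/* Sum of len over the chain; optionally report the last segment, as m_length() does. */
static int
seg_length(struct seg *s0, struct seg **last)
{
	struct seg *s;
	int len = 0;

	for (s = s0; s != NULL; s = s->next) {
		len += s->len;
		if (s->next == NULL)
			break;
	}
	if (last != NULL)
		*last = s;
	return (len);
}

int
main(void)
{
	struct seg c = { NULL, 3 }, b = { &c, 10 }, a = { &b, 5 };
	struct seg *tail;

	printf("%d\n", seg_length(&a, &tail));	/* prints 18 */
	printf("%d\n", tail->len);		/* prints 3 */
	return (0);
}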
*/ struct mbuf * m_defrag(struct mbuf *m0, int how) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0, length; if (!(m0->m_flags & M_PKTHDR)) return (m0); m_fixhdr(m0); /* Needed sanity check */ #ifdef MBUF_STRESS_TEST if (m_defragrandomfailures) { int temp = arc4random() & 0xff; if (temp == 0xba) goto nospace; } #endif if (m0->m_pkthdr.len > MHLEN) m_final = m_getcl(how, MT_DATA, M_PKTHDR); else m_final = m_gethdr(how, MT_DATA); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == NULL) goto nospace; m_new = m_final; while (progress < m0->m_pkthdr.len) { length = m0->m_pkthdr.len - progress; if (length > MCLBYTES) length = MCLBYTES; if (m_new == NULL) { if (length > MLEN) m_new = m_getcl(how, MT_DATA, 0); else m_new = m_get(how, MT_DATA); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, length, mtod(m_new, caddr_t)); progress += length; m_new->m_len = length; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } #ifdef MBUF_STRESS_TEST if (m0->m_next == NULL) m_defraguseless++; #endif m_freem(m0); m0 = m_final; #ifdef MBUF_STRESS_TEST m_defragpackets++; m_defragbytes += m0->m_pkthdr.len; #endif return (m0); nospace: #ifdef MBUF_STRESS_TEST m_defragfailure++; #endif if (m_new) m_free(m_new); if (m_final) m_freem(m_final); return (NULL); } #define MAX_CLREFCOUNT 32 /* * Ensure that the number of mbuf cluster references stays less than our * desired amount by making a new copy of the entire chain. * * If a reference count has already gone negative, panic. */ static struct mbuf * m_clreflimit(struct mbuf *m0, int how) { struct mbuf *m; int maxrefs = 0; for (m = m0; m != NULL; m = m->m_next) { if ((m->m_flags & M_EXT) && (m->m_ext.ext_ref == NULL)) { maxrefs = max(maxrefs, mclrefcnt[mtocl(m->m_ext.ext_buf)]); KASSERT(mclrefcnt[mtocl(m->m_ext.ext_buf)] > 0, ("m_clreflimit: bad reference count: %d", mclrefcnt[mtocl(m->m_ext.ext_buf)])); } } if (maxrefs < MAX_CLREFCOUNT) return (m0); m_clreflimithits++; m = m_defrag(m0, how); /* Avoid returning NULL at all costs, m_split won't like it. */ if (m == NULL) return (m0); else return (m); } Index: stable/4/sys/sys/mbuf.h =================================================================== --- stable/4/sys/sys/mbuf.h (revision 118739) +++ stable/4/sys/sys/mbuf.h (revision 118740) @@ -1,712 +1,723 @@ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ #include /* * Mbufs are of a single size, MSIZE (machine/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size * MCLBYTES (also in machine/param.h), which has no additional overhead * and is used instead of the internal data area; this is done when * at least MINCLSIZE of data must be stored. */ #define MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */ #define MHLEN (MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */ #define MINCLSIZE (MHLEN + 1) /* smallest amount to put in cluster */ #define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */ /* * Macros for type conversion: * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * dtom(x) -- Convert data pointer within mbuf to mbuf pointer (XXX). * mtocl(x) - convert pointer within cluster to cluster index # * cltom(x) - convert cluster # to ptr to beginning of cluster */ #define mtod(m, t) ((t)((m)->m_data)) #define dtom(x) ((struct mbuf *)((intptr_t)(x) & ~(MSIZE-1))) #define mtocl(x) (((uintptr_t)(x) - (uintptr_t)mbutl) >> MCLSHIFT) #define cltom(x) ((caddr_t)((uintptr_t)mbutl + \ ((uintptr_t)(x) << MCLSHIFT))) +#define mcl_valid(x) ((uintptr_t)(x) >= (uintptr_t)mbutl && \ + (uintptr_t)(x) < (uintptr_t)mbutltop) /* * Header present at the beginning of every mbuf. */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ caddr_t mh_data; /* location of data */ int mh_len; /* amount of data in this mbuf */ short mh_type; /* type of data in this mbuf */ short mh_flags; /* flags; see below */ }; /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. */ struct pkthdr { struct ifnet *rcvif; /* rcv interface */ int len; /* total packet length */ /* variables for ip and tcp reassembly */ void *header; /* pointer to packet header */ /* variables for hardware checksum */ int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ }; /* * Description of external storage mapped into mbuf; valid only if M_EXT is set. 
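The new mcl_valid() macro above is the point of this change: mtocl() and cltom() convert between a pointer and a cluster index purely by subtracting mbutl and shifting by MCLSHIFT, so a corrupted ext_buf pointer silently indexes mclrefcnt[] out of bounds; mcl_valid() bounds the pointer against the [mbutl, mbutltop) range that m_mballoc()/m_clalloc() now maintain before the index is trusted. The arithmetic in miniature (ptr_to_index, ptr_valid and CLSHIFT are stand-ins chosen for the example):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define CLSHIFT		11			/* stands in for MCLSHIFT */
#define CLBYTES		(1 << CLSHIFT)		/* stands in for MCLBYTES */

static char *base, *top;			/* stand in for mbutl / mbutltop */

/* Pointer-within-cluster to cluster index, the mtocl() arithmetic. */
static size_t
ptr_to_index(const void *p)
{
	return (((uintptr_t)p - (uintptr_t)base) >> CLSHIFT);
}

/* The mcl_valid() idea: only pointers inside [base, top) yield a trustworthy index. */
static int
ptr_valid(const void *p)
{
	return ((uintptr_t)p >= (uintptr_t)base && (uintptr_t)p < (uintptr_t)top);
}

int
main(void)
{
	base = malloc(6 * CLBYTES);
	top = base + 4 * CLBYTES;	/* pretend only 4 clusters were carved */

	printf("index %zu valid %d\n", ptr_to_index(base + 3 * CLBYTES),
	    ptr_valid(base + 3 * CLBYTES));			/* index 3 valid 1 */
	printf("valid %d\n", ptr_valid(base + 5 * CLBYTES));	/* valid 0: junk pointer */
	return (0);
}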
*/ struct m_ext { caddr_t ext_buf; /* start of buffer */ void (*ext_free) /* free routine if not the usual */ (caddr_t, u_int); u_int ext_size; /* size of buffer, for ext_free */ void (*ext_ref) /* add a reference to the ext object */ (caddr_t, u_int); }; /* * The core of the mbuf object along with some shortcut defines for * practical purposes. */ struct mbuf { struct m_hdr m_hdr; union { struct { struct pkthdr MH_pkthdr; /* M_PKTHDR set */ union { struct m_ext MH_ext; /* M_EXT set */ char MH_databuf[MHLEN]; } MH_dat; } MH; char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ } M_dat; }; #define m_next m_hdr.mh_next #define m_len m_hdr.mh_len #define m_data m_hdr.mh_data #define m_type m_hdr.mh_type #define m_flags m_hdr.mh_flags #define m_nextpkt m_hdr.mh_nextpkt #define m_act m_nextpkt #define m_pkthdr M_dat.MH.MH_pkthdr #define m_ext M_dat.MH.MH_dat.MH_ext #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf /* * mbuf flags. */ #define M_EXT 0x0001 /* has associated external storage */ #define M_PKTHDR 0x0002 /* start of record */ #define M_EOR 0x0004 /* end of record */ #define M_PROTO1 0x0008 /* protocol-specific */ #define M_PROTO2 0x0010 /* protocol-specific */ #define M_PROTO3 0x0020 /* protocol-specific */ #define M_PROTO4 0x0040 /* protocol-specific */ #define M_PROTO5 0x0080 /* protocol-specific */ /* * mbuf pkthdr flags (also stored in m_flags). */ #define M_BCAST 0x0100 /* send/received as link-level broadcast */ #define M_MCAST 0x0200 /* send/received as link-level multicast */ #define M_FRAG 0x0400 /* packet is a fragment of a larger packet */ #define M_FIRSTFRAG 0x0800 /* packet is first fragment */ #define M_LASTFRAG 0x1000 /* packet is last fragment */ /* * Flags copied when copying m_pkthdr. */ #define M_COPYFLAGS (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO1|M_PROTO2|M_PROTO3 | \ M_PROTO4|M_PROTO5|M_BCAST|M_MCAST|M_FRAG | \ M_FIRSTFRAG|M_LASTFRAG) /* * Flags indicating hw checksum support and sw checksum requirements. */ #define CSUM_IP 0x0001 /* will csum IP */ #define CSUM_TCP 0x0002 /* will csum TCP */ #define CSUM_UDP 0x0004 /* will csum UDP */ #define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ #define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ #define CSUM_IP_CHECKED 0x0100 /* did csum IP */ #define CSUM_IP_VALID 0x0200 /* ... the csum is valid */ #define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ #define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ #define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) #define CSUM_DELAY_IP (CSUM_IP) /* XXX add ipv6 here too? */ /* * mbuf types. */ #define MT_FREE 0 /* should be on free list */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER 2 /* packet header */ #if 0 #define MT_SOCKET 3 /* socket structure */ #define MT_PCB 4 /* protocol control block */ #define MT_RTABLE 5 /* routing tables */ #define MT_HTABLE 6 /* IMP host tables */ #define MT_ATABLE 7 /* address resolution tables */ #endif #define MT_SONAME 8 /* socket name */ #if 0 #define MT_SOOPTS 10 /* socket options */ #endif #define MT_FTABLE 11 /* fragment reassembly header */ #if 0 #define MT_RIGHTS 12 /* access rights */ #define MT_IFADDR 13 /* interface address */ #endif #define MT_TAG 13 /* volatile metadata associated to pkts */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_OOBDATA 15 /* expedited data */ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ /* * General mbuf allocator statistics structure. 
*/ struct mbstat { u_long m_mbufs; /* mbufs obtained from page pool */ u_long m_clusters; /* clusters obtained from page pool */ u_long m_spare; /* spare field */ u_long m_clfree; /* free clusters */ u_long m_drops; /* times failed to find space */ u_long m_wait; /* times waited for space */ u_long m_drain; /* times drained protocols for space */ u_long m_mcfail; /* times m_copym failed */ u_long m_mpfail; /* times m_pullup failed */ u_long m_msize; /* length of an mbuf */ u_long m_mclbytes; /* length of an mbuf cluster */ u_long m_minclsize; /* min length of data to allocate a cluster */ u_long m_mlen; /* length of data in an mbuf */ u_long m_mhlen; /* length of data in a header mbuf */ }; /* * Flags specifying how an allocation should be made. */ #define M_DONTWAIT 1 #define M_WAIT 0 /* Freelists: * * Normal mbuf clusters are normally treated as character arrays * after allocation, but use the first word of the buffer as a free list * pointer while on the free list. */ union mcluster { union mcluster *mcl_next; char mcl_buf[MCLBYTES]; }; /* * These are identifying numbers passed to the m_mballoc_wait function, * allowing us to determine whether the call came from an MGETHDR or * an MGET. */ #define MGETHDR_C 1 #define MGET_C 2 /* * Wake up the next instance (if any) of m_mballoc_wait() which is * waiting for an mbuf to be freed. This should be called at splimp(). * * XXX: If there is another free mbuf, this routine will be called [again] * from the m_mballoc_wait routine in order to wake another sleep instance. */ #define MMBWAKEUP() do { \ if (m_mballoc_wid) { \ m_mballoc_wid--; \ wakeup_one(&m_mballoc_wid); \ } \ } while (0) /* * Same as above, but for mbuf cluster(s). */ #define MCLWAKEUP() do { \ if (m_clalloc_wid) { \ m_clalloc_wid--; \ wakeup_one(&m_clalloc_wid); \ } \ } while (0) /* * mbuf utility macros: * * MBUFLOCK(code) * prevents a section of code from from being interrupted by network * drivers. */ #define MBUFLOCK(code) do { \ int _ms = splimp(); \ \ { code } \ splx(_ms); \ } while (0) /* * mbuf allocation/deallocation macros: * * MGET(struct mbuf *m, int how, int type) * allocates an mbuf and initializes it to contain internal data. * * MGETHDR(struct mbuf *m, int how, int type) * allocates an mbuf and initializes it to contain a packet header * and internal data. 
*/ #define MGET(m, how, type) do { \ struct mbuf *_mm; \ int _mhow = (how); \ int _mtype = (type); \ int _ms = splimp(); \ \ if (mmbfree == NULL) \ (void)m_mballoc(1, _mhow); \ _mm = mmbfree; \ if (_mm != NULL) { \ mmbfree = _mm->m_next; \ mbtypes[MT_FREE]--; \ _mm->m_type = _mtype; \ mbtypes[_mtype]++; \ _mm->m_next = NULL; \ _mm->m_nextpkt = NULL; \ _mm->m_data = _mm->m_dat; \ _mm->m_flags = 0; \ (m) = _mm; \ splx(_ms); \ } else { \ splx(_ms); \ _mm = m_retry(_mhow, _mtype); \ if (_mm == NULL && _mhow == M_WAIT) \ (m) = m_mballoc_wait(MGET_C, _mtype); \ else \ (m) = _mm; \ } \ } while (0) #define MGETHDR(m, how, type) do { \ struct mbuf *_mm; \ int _mhow = (how); \ int _mtype = (type); \ int _ms = splimp(); \ \ if (mmbfree == NULL) \ (void)m_mballoc(1, _mhow); \ _mm = mmbfree; \ if (_mm != NULL) { \ mmbfree = _mm->m_next; \ mbtypes[MT_FREE]--; \ _mm->m_type = _mtype; \ mbtypes[_mtype]++; \ _mm->m_next = NULL; \ _mm->m_nextpkt = NULL; \ _mm->m_data = _mm->m_pktdat; \ _mm->m_flags = M_PKTHDR; \ _mm->m_pkthdr.rcvif = NULL; \ SLIST_INIT(&_mm->m_pkthdr.tags); \ _mm->m_pkthdr.csum_flags = 0; \ (m) = _mm; \ splx(_ms); \ } else { \ splx(_ms); \ _mm = m_retryhdr(_mhow, _mtype); \ if (_mm == NULL && _mhow == M_WAIT) \ (m) = m_mballoc_wait(MGETHDR_C, _mtype); \ else \ (m) = _mm; \ } \ } while (0) /* * Mbuf cluster macros. * MCLALLOC(caddr_t p, int how) allocates an mbuf cluster. * MCLGET adds such clusters to a normal mbuf; * the flag M_EXT is set upon success. * MCLFREE releases a reference to a cluster allocated by MCLALLOC, * freeing the cluster if the reference count has reached 0. */ #define MCLALLOC(p, how) do { \ caddr_t _mp; \ int _mhow = (how); \ int _ms = splimp(); \ \ if (mclfree == NULL) \ (void)m_clalloc(1, _mhow); \ _mp = (caddr_t)mclfree; \ if (_mp != NULL) { \ + KASSERT(mcl_valid(_mp), \ + ("MCLALLOC junk pointer: %x < %x < %x.", \ + (uintptr_t)mbutl, (uintptr_t)_mp, \ + (uintptr_t)mbutltop)); \ KASSERT(mclrefcnt[mtocl(_mp)] == 0, \ ("free cluster with refcount %d.", \ mclrefcnt[mtocl(_mp)])); \ mclrefcnt[mtocl(_mp)]++; \ mbstat.m_clfree--; \ mclfree = ((union mcluster *)_mp)->mcl_next; \ (p) = _mp; \ splx(_ms); \ } else { \ splx(_ms); \ if (_mhow == M_WAIT) \ (p) = m_clalloc_wait(); \ else \ (p) = NULL; \ } \ } while (0) #define MCLGET(m, how) do { \ struct mbuf *_mm = (m); \ \ MCLALLOC(_mm->m_ext.ext_buf, (how)); \ if (_mm->m_ext.ext_buf != NULL) { \ _mm->m_data = _mm->m_ext.ext_buf; \ _mm->m_flags |= M_EXT; \ _mm->m_ext.ext_free = NULL; \ _mm->m_ext.ext_ref = NULL; \ _mm->m_ext.ext_size = MCLBYTES; \ } \ } while (0) #define MCLFREE1(p) do { \ union mcluster *_mp = (union mcluster *)(p); \ \ + KASSERT(mcl_valid(_mp), \ + ("MCLFREE1 junk pointer: %x < %x < %x.", \ + (uintptr_t)mbutl, (uintptr_t)_mp, \ + (uintptr_t)mbutltop)); \ KASSERT(mclrefcnt[mtocl(_mp)] > 0, \ ("freeing free cluster, refcount: %d.", \ mclrefcnt[mtocl(_mp)])); \ if (--mclrefcnt[mtocl(_mp)] == 0) { \ _mp->mcl_next = mclfree; \ mclfree = _mp; \ mbstat.m_clfree++; \ MCLWAKEUP(); \ } \ } while (0) #define MCLFREE(p) MBUFLOCK( \ MCLFREE1(p); \ ) #define MEXTFREE1(m) do { \ struct mbuf *_mm = (m); \ \ if (_mm->m_ext.ext_free != NULL) \ (*_mm->m_ext.ext_free)(_mm->m_ext.ext_buf, \ _mm->m_ext.ext_size); \ else \ MCLFREE1(_mm->m_ext.ext_buf); \ } while (0) #define MEXTFREE(m) MBUFLOCK( \ MEXTFREE1(m); \ ) /* * NB: M_COPY_PKTHDR is deprecated; use either M_MOVE_PKTHDR * or m_dup_pkthdr. */ /* * Move mbuf pkthdr from "from" to "to". * from should have M_PKTHDR set, and to must be empty. 
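The KASSERTs added to MCLALLOC() and MCLFREE1() above enforce two invariants on every cluster transition: the pointer must lie inside the cluster arena (mcl_valid()), and its reference count must be zero when it leaves the free list and non-zero when it comes back. Compressed into free-standing form, with assert() standing in for KASSERT and a plain array standing in for the mbutl..mbutltop range (cl_alloc/cl_free/cl_index are illustrative names; the real MCLALLOC pops from a free list rather than taking an index):

#include <assert.h>
#include <stdio.h>

#define NCL	4

static char arena[NCL][2048];		/* stands in for the mbutl..mbutltop range */
static char refcnt[NCL];		/* stands in for mclrefcnt[] */

static int
cl_index(const char *p)
{
	/* The mcl_valid() check: refuse pointers outside the arena. */
	assert(p >= (char *)arena && p < (char *)arena + sizeof(arena));
	return ((int)((p - (char *)arena) / sizeof(arena[0])));
}

static char *
cl_alloc(int i)
{
	char *p = arena[i];

	assert(refcnt[cl_index(p)] == 0);	/* "free cluster with refcount" */
	refcnt[cl_index(p)]++;
	return (p);
}

static void
cl_free(char *p)
{
	assert(refcnt[cl_index(p)] > 0);	/* "freeing free cluster" */
	refcnt[cl_index(p)]--;
}

int
main(void)
{
	char *p = cl_alloc(2);

	cl_free(p);
	printf("refcount now %d\n", refcnt[2]);	/* prints 0 */
	return (0);
}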
* from no longer has a pkthdr after this operation. */ #define M_MOVE_PKTHDR(_to, _from) m_move_pkthdr((_to), (_from)) /* * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place * an object of the specified size at the end of the mbuf, longword aligned. */ #define M_ALIGN(m, len) do { \ (m)->m_data += (MLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * As above, for mbufs allocated with m_gethdr/MGETHDR * or initialized by M_COPY_PKTHDR. */ #define MH_ALIGN(m, len) do { \ (m)->m_data += (MHLEN - (len)) & ~(sizeof(long) - 1); \ } while (0) /* * Check if we can write to an mbuf. */ #define M_EXT_WRITABLE(m) \ ((m)->m_ext.ext_free == NULL && mclrefcnt[mtocl((m)->m_ext.ext_buf)] == 1) #define M_WRITABLE(m) (!((m)->m_flags & M_EXT) || \ M_EXT_WRITABLE(m) ) /* * Compute the amount of space available * before the current start of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_LEADINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_EXT_WRITABLE(m) ? (m)->m_data - (m)->m_ext.ext_buf : 0): \ (m)->m_flags & M_PKTHDR ? (m)->m_data - (m)->m_pktdat : \ (m)->m_data - (m)->m_dat) /* * Compute the amount of space available * after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. */ #define M_TRAILINGSPACE(m) \ ((m)->m_flags & M_EXT ? \ (M_WRITABLE(m) ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size \ - ((m)->m_data + (m)->m_len) : 0) : \ &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len)) /* * Arrange to prepend space of size plen to mbuf m. * If a new mbuf must be allocated, how specifies whether to wait. * If how is M_DONTWAIT and allocation fails, the original mbuf chain * is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* change mbuf to new type */ #define MCHTYPE(m, t) do { \ struct mbuf *_mm = (m); \ int _mt = (t); \ int _ms = splimp(); \ \ mbtypes[_mm->m_type]--; \ mbtypes[_mt]++; \ splx(_ms); \ _mm->m_type = (_mt); \ } while (0) /* Length to m_copy to copy all. 
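M_ALIGN() and MH_ALIGN() above place a len-byte object at the end of the data area by advancing m_data by the remaining space rounded down to a long boundary. The arithmetic on its own (DATASIZE and align_offset are stand-ins for MLEN/MHLEN and the macro body):

#include <stdio.h>

#define DATASIZE 224		/* stands in for MLEN */

/* Offset at which a len-byte object lands, per the M_ALIGN() arithmetic. */
static unsigned long
align_offset(unsigned long len)
{
	return ((DATASIZE - len) & ~(sizeof(long) - 1));
}

int
main(void)
{
	/* Prints 208 with either 4- or 8-byte longs: 224 - 208 = 16 bytes
	   remain for the 14-byte object, and 208 is long-aligned. */
	printf("%lu\n", align_offset(14));
	return (0);
}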
*/ #define M_COPYALL 1000000000 /* Compatibility with 4.3 */ #define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) #ifdef _KERNEL extern u_int m_clalloc_wid; /* mbuf cluster wait count */ extern u_int m_mballoc_wid; /* mbuf wait count */ extern int max_linkhdr; /* largest link-level header */ extern int max_protohdr; /* largest protocol header */ extern int max_hdr; /* largest link+protocol header */ extern int max_datalen; /* MHLEN - max_hdr */ extern struct mbstat mbstat; extern u_long mbtypes[MT_NTYPES]; /* per-type mbuf allocations */ extern int mbuf_wait; /* mbuf sleep time */ extern struct mbuf *mbutl; /* virtual address of mclusters */ +extern struct mbuf *mbutltop; /* highest address of mclusters */ extern char *mclrefcnt; /* cluster reference counts */ extern union mcluster *mclfree; extern struct mbuf *mmbfree; extern int nmbclusters; extern int nmbufs; extern int nsfbufs; void m_adj(struct mbuf *, int); void m_cat(struct mbuf *, struct mbuf *); int m_clalloc(int, int); caddr_t m_clalloc_wait(void); void m_copyback(struct mbuf *, int, int, caddr_t); void m_copydata(struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); struct mbuf *m_defrag(struct mbuf *, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*copy)(char *, caddr_t, u_int)); struct mbuf *m_dup(struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, struct mbuf *, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_free(struct mbuf *); void m_freem(struct mbuf *); struct mbuf *m_get(int, int); struct mbuf *m_getcl(int how, short type, int flags); struct mbuf *m_getclr(int, int); struct mbuf *m_gethdr(int, int); struct mbuf *m_getm(struct mbuf *, int, int, int); u_int m_length(struct mbuf *, struct mbuf **); int m_mballoc(int, int); struct mbuf *m_mballoc_wait(int, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *m); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); struct mbuf *m_retry(int, int); struct mbuf *m_retryhdr(int, int); struct mbuf *m_split(struct mbuf *, int, int); /* * Packets may have annotations attached by affixing a list * of "packet tags" to the pkthdr structure. Packet tags are * dynamically allocated semi-opaque data structures that have * a fixed header (struct m_tag) that specifies the size of the * memory block and a pair that identifies it. * The cookie is a 32-bit unique unsigned value used to identify * a module or ABI. By convention this value is chose as the * date+time that the module is created, expressed as the number of * seconds since the epoch (e.g. using date -u +'%s'). The type value * is an ABI/module-specific value that identifies a particular annotation * and is private to the module. For compatibility with systems * like openbsd that define packet tags w/o an ABI/module cookie, * the value PACKET_ABI_COMPAT is used to implement m_tag_get and * m_tag_find compatibility shim functions and several tag types are * defined below. Users that do not require compatibility should use * a private cookie value so that packet tag-related definitions * can be maintained privately. * * Note that the packet tag returned by m_tag_allocate has the default * memory alignment implemented by malloc. 
To reference private data * one can use a construct like: * * struct m_tag *mtag = m_tag_allocate(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members * of struct foo. Otherwise it is necessary to embed struct m_tag within * the private data structure to insure proper alignment; e.g. * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_allocate(...); * struct m_tag *mtag = &p->tag; */ #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tag for use with PACKET_ABI_COMPAT */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ /* * As a temporary and low impact solution to replace the even uglier * approach used so far in some parts of the network stack (which relies * on global variables), packet tag-like annotations are stored in MT_TAG * mbufs (or lookalikes) prepended to the actual mbuf chain. * * m_type = MT_TAG * m_flags = m_tag_id * m_next = next buffer in chain. * * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. */ #define _m_tag_id m_hdr.mh_flags /* Packet tags used in the FreeBSD network stack */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_IPFW 16 /* ipfw classification */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ /* Packet tag routines */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_free(struct m_tag *); void m_tag_prepend(struct mbuf *, struct m_tag *); void m_tag_unlink(struct mbuf *, struct m_tag *); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, struct mbuf *, int); void m_tag_init(struct mbuf *); struct m_tag *m_tag_first(struct mbuf *); struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); /* these are for openbsd compatibility */ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return m_tag_locate(m, MTAG_ABI_COMPAT, type, start); } #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */
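The packet-tag comment in this header shows two layouts for private data behind a struct m_tag; the embedding form is the one to use when the private structure needs stricter alignment than struct m_tag provides. A small user-space rendering of that embedding trick, with stand-in types (struct hdr plays the role of struct m_tag, struct foo is the hypothetical consumer structure from the comment):

#include <stdio.h>
#include <stdlib.h>

struct hdr {			/* stands in for struct m_tag */
	unsigned short id;
	unsigned short len;
	unsigned int cookie;
};

struct foo {
	struct hdr tag;		/* embed the header first, as the comment suggests */
	double value;		/* member with stricter alignment than struct hdr */
};

int
main(void)
{
	struct foo *p = malloc(sizeof(*p));

	p->tag.id = 1;
	p->tag.len = sizeof(*p) - sizeof(p->tag);
	p->value = 3.14;	/* properly aligned because foo, not hdr, was allocated */
	printf("tag at %p, private data at %p\n", (void *)&p->tag,
	    (void *)&p->value);
	free(p);
	return (0);
}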