Index: stable/6/sys/kern/imgact_aout.c =================================================================== --- stable/6/sys/kern/imgact_aout.c (revision 156759) +++ stable/6/sys/kern/imgact_aout.c (revision 156760) @@ -1,259 +1,270 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_aout_imgact(struct image_params *imgp); static int aout_fixup(register_t **stack_base, struct image_params *imgp); struct sysentvec aout_sysvec = { SYS_MAXSYSCALL, sysent, 0, 0, NULL, 0, NULL, NULL, aout_fixup, sendsig, sigcode, &szsigcode, NULL, "FreeBSD a.out", NULL, NULL, MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs, NULL }; static int aout_fixup(stack_base, imgp) register_t **stack_base; struct image_params *imgp; { return (suword(--(*stack_base), imgp->args->argc)); } static int exec_aout_imgact(imgp) struct image_params *imgp; { const struct exec *a_out = (const struct exec *) imgp->image_header; + struct thread *td = curthread; struct vmspace *vmspace; - struct vnode *vp; vm_map_t map; vm_object_t object; vm_offset_t text_end, data_end; unsigned long virtual_offset; unsigned long file_offset; unsigned long bss_size; int error; /* * Linux and *BSD binaries look very much alike, * only the machine id is different: * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. * NetBSD is in network byte order.. ugh. */ if (((a_out->a_magic >> 16) & 0xff) != 0x86 && ((a_out->a_magic >> 16) & 0xff) != 0 && ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) return -1; /* * Set file/virtual offset based on a.out variant. * We do two cases: host byte order and network byte order * (for NetBSD compatibility) */ switch ((int)(a_out->a_magic & 0xffff)) { case ZMAGIC: virtual_offset = 0; if (a_out->a_text) { file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ file_offset = 0; } break; case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; /* Pass PS_STRINGS for BSD/OS binaries only. 
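Note on the ZMAGIC/QMAGIC switch above: it picks the file and virtual offsets for the a.out layouts the activator accepts, including the NetBSD variants whose magic is stored in network byte order. Below is a minimal userland sketch of that dispatch only; the magic values and EX_PAGE_SIZE are assumptions standing in for the kernel's a.out.h and PAGE_SIZE, and none of the kernel's other work is shown.

/*
 * Illustrative sketch (not kernel code): choose file_offset and
 * virtual_offset the way exec_aout_imgact() does for ZMAGIC/QMAGIC images,
 * including the NetBSD network-byte-order fallback.
 */
#include <arpa/inet.h>
#include <stdio.h>

#define EX_PAGE_SIZE	4096ul
#define ZMAGIC		0413	/* assumed demand-paged magic */
#define QMAGIC		0314	/* assumed "header mapped" magic */

struct layout { unsigned long virtual_offset, file_offset; };

static int
aout_layout(unsigned long a_magic, unsigned long a_text, struct layout *lo)
{
	switch ((int)(a_magic & 0xffff)) {
	case ZMAGIC:
		lo->virtual_offset = 0;
		/* "screwball mode": with no text the header is not skipped */
		lo->file_offset = a_text ? EX_PAGE_SIZE : 0;
		return (0);
	case QMAGIC:
		lo->virtual_offset = EX_PAGE_SIZE;
		lo->file_offset = 0;
		return (0);
	default:
		/* NetBSD compatibility: magic stored in network byte order */
		switch ((int)(ntohl(a_magic) & 0xffff)) {
		case ZMAGIC:
		case QMAGIC:
			lo->virtual_offset = EX_PAGE_SIZE;
			lo->file_offset = 0;
			return (0);
		default:
			return (-1);	/* not an a.out flavour we accept */
		}
	}
}

int
main(void)
{
	struct layout lo;

	if (aout_layout(QMAGIC, EX_PAGE_SIZE, &lo) == 0)
		printf("QMAGIC: voff=%lu foff=%lu\n",
		    lo.virtual_offset, lo.file_offset);
	return (0);
}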
*/ if (N_GETMID(*a_out) == MID_ZERO) imgp->ps_strings = aout_sysvec.sv_psstrings; break; default: /* NetBSD compatibility */ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: virtual_offset = PAGE_SIZE; file_offset = 0; break; default: return (-1); } } bss_size = roundup(a_out->a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. */ if (/* entry point must lay with text region */ a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || /* text and data size must each be page rounded */ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ PROC_LOCK(imgp->proc); if (/* text can't exceed maximum text size */ a_out->a_text > maxtsiz || /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } PROC_UNLOCK(imgp->proc); /* + * Avoid a possible deadlock if the current address space is destroyed + * and that address space maps the locked vnode. In the common case, + * the locked vnode's v_usecount is decremented but remains greater + * than zero. Consequently, the vnode lock is not needed by vrele(). + * However, in cases where the vnode lock is external, such as nullfs, + * v_usecount may become zero. + */ + VOP_UNLOCK(imgp->vp, 0, td); + + /* * Destroy old process VM and create a new one (with a new stack) */ exec_new_vmspace(imgp, &aout_sysvec); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + /* * The vm space can be changed by exec_new_vmspace */ vmspace = imgp->proc->p_vmspace; - vp = imgp->vp; object = imgp->object; map = &vmspace->vm_map; vm_map_lock(map); vm_object_reference(object); text_end = virtual_offset + a_out->a_text; error = vm_map_insert(map, object, file_offset, virtual_offset, text_end, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); return (error); } data_end = text_end + a_out->a_data; if (a_out->a_data) { vm_object_reference(object); error = vm_map_insert(map, object, file_offset + a_out->a_text, text_end, data_end, VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_PREFAULT); if (error) { vm_map_unlock(map); return (error); } } if (bss_size) { error = vm_map_insert(map, NULL, 0, data_end, data_end + bss_size, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_map_unlock(map); return (error); } } vm_map_unlock(map); /* Fill in process VM information */ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (virtual_offset + a_out->a_text); /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &aout_sysvec; return (0); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; EXEC_SET(aout, aout_execsw); Index: stable/6/sys/kern/imgact_elf.c =================================================================== --- stable/6/sys/kern/imgact_elf.c (revision 156759) +++ stable/6/sys/kern/imgact_elf.c (revision 156760) @@ -1,1342 +1,1295 @@ /*- * Copyright (c) 2000 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. 
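Note on the a.out bounds checks earlier in exec_aout_imgact(): the entry point must fall inside the text region, text and data must be page rounded, and text + data may not exceed the file size; the resulting text/data/bss ranges are then mapped back to back. The sketch below reproduces only that arithmetic with a stand-in struct and page size; the rlimit checks and the actual vm_map_insert() calls are omitted.

/*
 * Illustrative sketch of the a.out sanity checks and the text/data/bss
 * layout computed by exec_aout_imgact().  Struct and page size are
 * stand-ins for the kernel definitions.
 */
#include <stdio.h>

#define EX_PAGE_SIZE	4096ul
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)
#define ex_roundup(x)	(((x) + EX_PAGE_MASK) & ~EX_PAGE_MASK)

struct aout_hdr {		/* subset of struct exec */
	unsigned long a_text, a_data, a_bss, a_entry;
};

static int
aout_check(const struct aout_hdr *a, unsigned long virtual_offset,
    unsigned long file_size)
{
	unsigned long bss_size = ex_roundup(a->a_bss);

	/* entry point must lie within the text region */
	if (a->a_entry < virtual_offset ||
	    a->a_entry >= virtual_offset + a->a_text)
		return (-1);
	/* text and data must each be page rounded */
	if ((a->a_text & EX_PAGE_MASK) || (a->a_data & EX_PAGE_MASK))
		return (-1);
	/* text + data cannot exceed the file size */
	if (a->a_data + a->a_text > file_size)
		return (-1);

	printf("text [%#lx, %#lx)\n", virtual_offset,
	    virtual_offset + a->a_text);
	printf("data [%#lx, %#lx)\n", virtual_offset + a->a_text,
	    virtual_offset + a->a_text + a->a_data);
	printf("bss  [%#lx, %#lx)\n",
	    virtual_offset + a->a_text + a->a_data,
	    virtual_offset + a->a_text + a->a_data + bss_size);
	return (0);
}

int
main(void)
{
	struct aout_hdr a = { 2 * EX_PAGE_SIZE, EX_PAGE_SIZE, 100,
	    EX_PAGE_SIZE };

	return (aout_check(&a, EX_PAGE_SIZE, 3 * EX_PAGE_SIZE) ? 1 : 0);
}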
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 #include #include #endif #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize); -static int __elfN(load_section)(struct proc *p, - struct vmspace *vmspace, struct vnode *vp, vm_object_t object, +static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand", &__elfN(fallback_brand)); static int elf_trace = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(trace), CTLFLAG_RW, &elf_trace, 0, ""); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, ""); static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == 
MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp) { Elf_Brandinfo *bi; int i; /* * We support three types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, and (3) path of the `interp_path' * field. We should also look for an ".note.ABI-tag" ELF section now * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones. */ /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0)) return (bi); } /* Lacking a known brand, search for a recognized interpreter. */ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && strcmp(interp, bi->interp_path) == 0) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, - vm_offset_t start, vm_offset_t end, vm_prot_t prot, - vm_prot_t max) + vm_offset_t start, vm_offset_t end, vm_prot_t prot) { - int error, rv; + struct sf_buf *sf; + int error; vm_offset_t off; - vm_offset_t data_buf = 0; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_lock(map); - vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), max, - max, 0); + vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), + VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Find the page from the underlying object. 
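Note on the map_partial() change that follows: instead of mapping the object into exec_map with vm_map_find(), the code now borrows a single page via vm_imgact_map_page(), copies the sub-page fragment out, and releases it with vm_imgact_unmap_page(). The sketch below is only a userland analogue of that pattern, with mmap()/memcpy()/munmap() standing in for the sf_buf calls and a file standing in for the VM object; nothing here is the kernel API.

/*
 * Userland analogue: temporarily map one page of the backing object (a file
 * here), copy only the requested sub-page fragment, drop the mapping.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
copy_fragment(int fd, off_t offset, void *dst, size_t len)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	off_t pgoff = offset & ~((off_t)pagesz - 1);	/* trunc_page(offset) */
	size_t off = (size_t)(offset - pgoff);
	char *kva;

	if (off + len > (size_t)pagesz)
		return (-1);		/* fragment must fit in one page */
	kva = mmap(NULL, (size_t)pagesz, PROT_READ, MAP_PRIVATE, fd, pgoff);
	if (kva == MAP_FAILED)		/* kernel: vm_imgact_map_page() */
		return (-1);
	memcpy(dst, kva + off, len);	/* kernel: copyout() to user space */
	munmap(kva, (size_t)pagesz);	/* kernel: vm_imgact_unmap_page() */
	return (0);
}

int
main(int argc, char **argv)
{
	char buf[64];
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return (1);
	if (copy_fragment(fd, 10, buf, sizeof(buf) - 1) == 0) {
		buf[sizeof(buf) - 1] = '\0';
		printf("%s\n", buf);
	}
	close(fd);
	return (0);
}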
*/ if (object) { - vm_object_reference(object); - rv = vm_map_find(exec_map, - object, - trunc_page(offset), - &data_buf, - PAGE_SIZE, - TRUE, - VM_PROT_READ, - VM_PROT_ALL, - MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL); - if (rv != KERN_SUCCESS) { - vm_object_deallocate(object); - return (rv); - } - + sf = vm_imgact_map_page(object, offset); + if (sf == NULL) + return (KERN_FAILURE); off = offset - trunc_page(offset); - error = copyout((caddr_t)data_buf + off, (caddr_t)start, + error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); - vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE); + vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } } return (KERN_SUCCESS); } static int __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, - vm_offset_t start, vm_offset_t end, vm_prot_t prot, - vm_prot_t max, int cow) + vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { - vm_offset_t data_buf, off; + struct sf_buf *sf; + vm_offset_t off; vm_size_t sz; int error, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, - round_page(start), prot, max); + round_page(start), prot); if (rv) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + - trunc_page(end) - start, trunc_page(end), end, prot, max); + trunc_page(end) - start, trunc_page(end), end, prot); if (rv) return (rv); end = trunc_page(end); } if (end > start) { if (offset & PAGE_MASK) { /* * The mapping is not page aligned. This means we have * to copy the data. Sigh. */ - rv = vm_map_find(map, 0, 0, &start, end - start, - FALSE, prot, max, 0); + rv = vm_map_find(map, NULL, 0, &start, end - start, + FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0); if (rv) return (rv); - data_buf = 0; - while (start < end) { - vm_object_reference(object); - rv = vm_map_find(exec_map, - object, - trunc_page(offset), - &data_buf, - 2 * PAGE_SIZE, - TRUE, - VM_PROT_READ, - VM_PROT_ALL, - (MAP_COPY_ON_WRITE - | MAP_PREFAULT_PARTIAL)); - if (rv != KERN_SUCCESS) { - vm_object_deallocate(object); - return (rv); - } + if (object == NULL) + return (KERN_SUCCESS); + for (; start < end; start += sz) { + sf = vm_imgact_map_page(object, offset); + if (sf == NULL) + return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; - if (sz > PAGE_SIZE) - sz = PAGE_SIZE; - error = copyout((caddr_t)data_buf + off, + if (sz > PAGE_SIZE - off) + sz = PAGE_SIZE - off; + error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); - vm_map_remove(exec_map, data_buf, - data_buf + 2 * PAGE_SIZE); + vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } - start += sz; + offset += sz; } rv = KERN_SUCCESS; } else { + vm_object_reference(object); vm_map_lock(map); rv = vm_map_insert(map, object, offset, start, end, - prot, max, cow); + prot, VM_PROT_ALL, cow); vm_map_unlock(map); + if (rv != KERN_SUCCESS) + vm_object_deallocate(object); } return (rv); } else { return (KERN_SUCCESS); } } static int -__elfN(load_section)(struct proc *p, struct vmspace *vmspace, - struct vnode *vp, vm_object_t object, vm_offset_t offset, +__elfN(load_section)(struct vmspace *vmspace, + vm_object_t object, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize) { + struct sf_buf *sf; size_t map_len; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_offset_t file_addr; - vm_offset_t data_buf = 0; - error = 0; - /* * It's necessary to 
fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } #define trunc_page_ps(va, ps) ((va) & ~(ps - 1)) #define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1)) map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize); file_addr = trunc_page_ps(offset, pagesize); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second.. */ if (memsz > filsz) map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr; else map_len = round_page_ps(offset + filsz, pagesize) - file_addr; if (map_len != 0) { - vm_object_reference(object); - /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(&vmspace->vm_map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ prot, - VM_PROT_ALL, cow); - if (rv != KERN_SUCCESS) { - vm_object_deallocate(object); + if (rv != KERN_SUCCESS) return (EINVAL); - } /* we can stop now if we've covered it all */ if (memsz == filsz) { return (0); } } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize); map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize); map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) - map_addr; /* This had damn well better be true! */ if (map_len != 0) { rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr, - map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0); + map_addr + map_len, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (EINVAL); } } if (copy_len != 0) { vm_offset_t off; - vm_object_reference(object); - rv = vm_map_find(exec_map, - object, - trunc_page(offset + filsz), - &data_buf, - PAGE_SIZE, - TRUE, - VM_PROT_READ, - VM_PROT_ALL, - MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL); - if (rv != KERN_SUCCESS) { - vm_object_deallocate(object); - return (EINVAL); - } + sf = vm_imgact_map_page(object, offset + filsz); + if (sf == NULL) + return (EIO); + /* send the page fragment to user space */ off = trunc_page_ps(offset + filsz, pagesize) - trunc_page(offset + filsz); - error = copyout((caddr_t)data_buf + off, (caddr_t)map_addr, - copy_len); - vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE); + error = copyout((caddr_t)sf_buf_kva(sf) + off, + (caddr_t)map_addr, copy_len); + vm_imgact_unmap_page(sf); if (error) { return (error); } } /* * set it to the specified protection. * XXX had better undo the damage from pasting over the cracks here! */ vm_map_protect(&vmspace->vm_map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); - return (error); + return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. 
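Note on load_section(): the interesting part is the page-size arithmetic that decides how much of the segment is file backed and how large the leftover fragment is that must be copied by hand into the anonymous bss mapping. The sketch below reproduces only that arithmetic; the sample numbers are invented and the page size is an assumption for sv->sv_pagesize.

/*
 * Sketch of the page-size arithmetic in __elfN(load_section)().  The macros
 * mirror trunc_page_ps()/round_page_ps() from the diff.
 */
#include <stdio.h>

#define trunc_page_ps(va, ps)	((va) & ~((ps) - 1))
#define round_page_ps(va, ps)	(((va) + (ps) - 1) & ~((ps) - 1))

int
main(void)
{
	unsigned long pagesize = 4096;		/* sv->sv_pagesize stand-in */
	unsigned long vmaddr = 0x400000;	/* segment virtual address */
	unsigned long offset = 0x2000;		/* file offset of the segment */
	unsigned long filsz = 0x1234;		/* bytes present in the file */
	unsigned long memsz = 0x4000;		/* bytes needed in memory */

	unsigned long map_addr = trunc_page_ps(vmaddr, pagesize);
	unsigned long file_addr = trunc_page_ps(offset, pagesize);
	unsigned long map_len, copy_len;

	if (memsz > filsz)	/* stop before the partially initialized page */
		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
	else
		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;

	/* fragment that must be copied into the start of the anon mapping */
	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);

	printf("file-backed mapping: %#lx bytes at %#lx\n", map_len, map_addr);
	printf("copy_len (tail copied into anon memory): %#lx\n", copy_len);
	return (0);
}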
* * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vmspace *vmspace = p->p_vmspace; struct vattr *attr; struct image_params *imgp; vm_prot_t prot; u_long rbase; u_long base_addr = 0; int vfslocked, error, i, numsegs; if (curthread->td_proc != p) panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */ tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; imgp->firstpage = NULL; imgp->image_header = NULL; imgp->object = NULL; imgp->execlabel = NULL; /* XXXKSE */ NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread); vfslocked = 0; if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } vfslocked = NDHASGIANT(nd); NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); - if (error) { - VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */ + if (error) goto fail; - } error = exec_map_first_page(imgp); + if (error) + goto fail; + /* * Also make certain that the interpreter stays the same, so set * its VV_TEXT flag, too. */ - if (error == 0) - nd->ni_vp->v_vflag |= VV_TEXT; + nd->ni_vp->v_vflag |= VV_TEXT; imgp->object = nd->ni_vp->v_object; - vm_object_reference(imgp->object); - VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */ - if (error) - goto fail; - hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ /* (multiplication of two Elf_Half fields will not overflow) */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */ prot = 0; if (phdr[i].p_flags & PF_X) prot |= VM_PROT_EXECUTE; if (phdr[i].p_flags & PF_W) prot |= VM_PROT_WRITE; if (phdr[i].p_flags & PF_R) prot |= VM_PROT_READ; - if ((error = __elfN(load_section)(p, vmspace, - nd->ni_vp, imgp->object, phdr[i].p_offset, + if ((error = __elfN(load_section)(vmspace, + imgp->object, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize)) != 0) goto fail; /* * Establish the base address if this is the * first segment. 
*/ if (numsegs == 0) base_addr = trunc_page(phdr[i].p_vaddr + rbase); numsegs++; } } *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); - if (imgp->object) - vm_object_deallocate(imgp->object); if (nd->ni_vp) - vrele(nd->ni_vp); + vput(nd->ni_vp); VFS_UNLOCK_GIANT(vfslocked); free(tempdata, M_TEMP); return (error); } static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; const Elf_Phdr *phdr; - Elf_Auxargs *elf_auxargs = NULL; + Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; vm_prot_t prot; u_long text_size = 0, data_size = 0, total_size = 0; u_long text_addr = 0, data_addr = 0; u_long seg_size, seg_addr; u_long addr, entry = 0, proghdr = 0; int error = 0, i; const char *interp = NULL; Elf_Brandinfo *brand_info; char *path; struct thread *td = curthread; struct sysentvec *sv; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { /* Only support headers in first page for now */ return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); - - /* - * From this point on, we may have resources that need to be freed. - */ - - VOP_UNLOCK(imgp->vp, 0, td); - for (i = 0; i < hdr->e_phnum; i++) { - switch (phdr[i].p_type) { - case PT_INTERP: /* Path to interpreter */ + if (phdr[i].p_type == PT_INTERP) { + /* Path to interpreter */ if (phdr[i].p_filesz > MAXPATHLEN || - phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) { - error = ENOEXEC; - goto fail; - } + phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) + return (ENOEXEC); interp = imgp->image_header + phdr[i].p_offset; break; - default: - break; } } brand_info = __elfN(get_brandinfo)(hdr, interp); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); - error = ENOEXEC; - goto fail; + return (ENOEXEC); } - if (hdr->e_type == ET_DYN && brand_info->brand != ELFOSABI_LINUX) { - error = ENOEXEC; - goto fail; - } + if (hdr->e_type == ET_DYN && brand_info->brand != ELFOSABI_LINUX) + return (ENOEXEC); sv = brand_info->sysvec; if (interp != NULL && brand_info->interp_newpath != NULL) interp = brand_info->interp_newpath; + /* + * Avoid a possible deadlock if the current address space is destroyed + * and that address space maps the locked vnode. In the common case, + * the locked vnode's v_usecount is decremented but remains greater + * than zero. Consequently, the vnode lock is not needed by vrele(). + * However, in cases where the vnode lock is external, such as nullfs, + * v_usecount may become zero. 
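Note on brand selection in the ELF activator: __elfN(get_brandinfo)() tries, in order, the EI_OSABI byte or the old FreeBSD 3.x brand string embedded in e_ident, then the PT_INTERP path, then the configurable fallback brand. The sketch below mirrors only that search order; the brand table, the ELFOSABI numbers shown, and the interpreter paths are illustrative, and the kernel additionally matches e_machine at every step.

/*
 * Sketch of the three-pass brand lookup order.
 */
#include <stdio.h>
#include <string.h>

struct brandinfo {
	int		brand;		/* EI_OSABI value */
	const char	*compat_3_brand;/* old string brand in e_ident */
	const char	*interp_path;	/* expected PT_INTERP path */
};

static struct brandinfo brands[] = {
	{ 9, "FreeBSD", "/libexec/ld-elf.so.1" },
	{ 3, "Linux",   "/lib/ld-linux.so.2" },
};
static int fallback_brand = 9;

static const struct brandinfo *
pick_brand(int osabi, const char *old_brand, const char *interp)
{
	size_t i, n = sizeof(brands) / sizeof(brands[0]);

	/* 1. explicit branding in the ELF header */
	for (i = 0; i < n; i++)
		if (osabi == brands[i].brand ||
		    strncmp(old_brand, brands[i].compat_3_brand,
		    strlen(brands[i].compat_3_brand)) == 0)
			return (&brands[i]);
	/* 2. a recognized interpreter path */
	if (interp != NULL)
		for (i = 0; i < n; i++)
			if (strcmp(interp, brands[i].interp_path) == 0)
				return (&brands[i]);
	/* 3. the configurable brand of last resort */
	for (i = 0; i < n; i++)
		if (brands[i].brand == fallback_brand)
			return (&brands[i]);
	return (NULL);
}

int
main(void)
{
	const struct brandinfo *bi = pick_brand(0, "", "/lib/ld-linux.so.2");

	printf("selected brand %d\n", bi != NULL ? bi->brand : -1);
	return (0);
}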
+ */ + VOP_UNLOCK(imgp->vp, 0, td); + exec_new_vmspace(imgp, sv); + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); + vmspace = imgp->proc->p_vmspace; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: /* Loadable segment */ prot = 0; if (phdr[i].p_flags & PF_X) prot |= VM_PROT_EXECUTE; if (phdr[i].p_flags & PF_W) prot |= VM_PROT_WRITE; if (phdr[i].p_flags & PF_R) prot |= VM_PROT_READ; #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER) /* * Some x86 binaries assume read == executable, * notably the M3 runtime and therefore cvsup */ if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif - if ((error = __elfN(load_section)(imgp->proc, vmspace, - imgp->vp, imgp->object, phdr[i].p_offset, + if ((error = __elfN(load_section)(vmspace, + imgp->object, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr, phdr[i].p_memsz, phdr[i].p_filesz, prot, sv->sv_pagesize)) != 0) - goto fail; + return (error); /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. */ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff; seg_addr = trunc_page(phdr[i].p_vaddr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr - seg_addr); /* * Is this .text or .data? We can't use * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the * alpha terribly and possibly does other bad * things so we stick to the old way of figuring * it out: If the segment contains the program * entry point, it's a text segment, otherwise it * is a data segment. * * Note that obreak() assumes that data_addr + * data_size == end of data load area, and the ELF * file format expects segments to be sorted by * address. If multiple data segments exist, the * last one will be used. */ if (hdr->e_entry >= phdr[i].p_vaddr && hdr->e_entry < (phdr[i].p_vaddr + phdr[i].p_memsz)) { text_size = seg_size; text_addr = seg_addr; entry = (u_long)hdr->e_entry; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr; break; default: break; } } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. */ PROC_LOCK(imgp->proc); if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) || text_size > maxtsiz || total_size > lim_cur(imgp->proc, RLIMIT_VMEM)) { PROC_UNLOCK(imgp->proc); - error = ENOMEM; - goto fail; + return (ENOMEM); } vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. 
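Note on the segment classification in the PT_LOAD loop that follows: the segment containing the entry point is treated as text, everything else as data, and the last data segment wins because obreak() expects data_addr + data_size to mark the end of the data area. The sketch below reproduces only that bookkeeping with invented addresses and an assumed 4 KB page size.

/*
 * Sketch of the text/data classification and page rounding applied to each
 * PT_LOAD segment in the ELF activator.
 */
#include <stdio.h>

#define PG 4096ul
#define trunc_page(x)	((x) & ~(PG - 1))
#define round_page(x)	(((x) + PG - 1) & ~(PG - 1))

struct seg { unsigned long vaddr, memsz; };

int
main(void)
{
	struct seg segs[] = {
		{ 0x400000, 0x3500 },	/* contains the entry point */
		{ 0x404000, 0x1200 },	/* initialized data + bss */
	};
	unsigned long entry = 0x400120;
	unsigned long text_addr = 0, text_size = 0;
	unsigned long data_addr = 0, data_size = 0, total = 0;
	size_t i;

	for (i = 0; i < sizeof(segs) / sizeof(segs[0]); i++) {
		unsigned long seg_addr = trunc_page(segs[i].vaddr);
		unsigned long seg_size =
		    round_page(segs[i].memsz + segs[i].vaddr - seg_addr);

		if (entry >= segs[i].vaddr &&
		    entry < segs[i].vaddr + segs[i].memsz) {
			text_addr = seg_addr;
			text_size = seg_size;
		} else {
			data_addr = seg_addr;
			data_size = seg_size;
		}
		total += seg_size;
	}
	/* a binary with a single segment: reuse text as data */
	if (data_addr == 0 && data_size == 0) {
		data_addr = text_addr;
		data_size = text_size;
	}
	printf("text %#lx/%#lx data %#lx/%#lx total %#lx\n",
	    text_addr, text_size, data_addr, data_size, total);
	return (0);
}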
*/ addr = round_page((vm_offset_t)imgp->proc->p_vmspace->vm_daddr + lim_max(imgp->proc, RLIMIT_DATA)); PROC_UNLOCK(imgp->proc); imgp->entry_addr = entry; imgp->proc->p_sysent = sv; - if (interp != NULL && brand_info->emul_path != NULL && - brand_info->emul_path[0] != '\0') { - path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); - snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, - interp); - error = __elfN(load_file)(imgp->proc, path, &addr, - &imgp->entry_addr, sv->sv_pagesize); - free(path, M_TEMP); - if (error == 0) - interp = NULL; - } if (interp != NULL) { - error = __elfN(load_file)(imgp->proc, interp, &addr, - &imgp->entry_addr, sv->sv_pagesize); + VOP_UNLOCK(imgp->vp, 0, td); + if (brand_info->emul_path != NULL && + brand_info->emul_path[0] != '\0') { + path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + snprintf(path, MAXPATHLEN, "%s%s", + brand_info->emul_path, interp); + error = __elfN(load_file)(imgp->proc, path, &addr, + &imgp->entry_addr, sv->sv_pagesize); + free(path, M_TEMP); + if (error == 0) + interp = NULL; + } + if (interp != NULL) { + error = __elfN(load_file)(imgp->proc, interp, &addr, + &imgp->entry_addr, sv->sv_pagesize); + } + vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (error != 0) { uprintf("ELF interpreter %s not found\n", interp); - goto fail; + return (error); } } /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; elf_auxargs->trace = elf_trace; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; -fail: - vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Addr *base; Elf_Addr *pos; base = (Elf_Addr *)*stack_base; pos = base + (imgp->args->argc + imgp->args->envc + 2); if (args->trace) { AUXARGS_ENTRY(pos, AT_DEBUG, 1); } if (args->execfd != -1) { AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); } AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword(base, (long)imgp->args->argc); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. 
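Note on the auxargs built here and consumed by __elfN(freebsd_fixup)(): the fixup writes a sequence of (type, value) pairs above argv/envp on the new stack, terminated by AT_NULL. The sketch below only lays out such a vector in a plain array; every value is a placeholder, the AT_* constants come from elf.h, and conditional entries such as AT_EXECFD and the trace entry are omitted.

/*
 * Sketch of the auxiliary vector layout written by the ELF fixup routine.
 */
#include <elf.h>
#include <stdio.h>

struct auxent { long a_type; unsigned long a_val; };

int
main(void)
{
	unsigned long phdr = 0x400040, base = 0x40000000ul, entry = 0x400120;
	struct auxent aux[] = {
		{ AT_PHDR,   phdr },		/* program headers in memory */
		{ AT_PHENT,  sizeof(Elf64_Phdr) },
		{ AT_PHNUM,  7 },
		{ AT_PAGESZ, 4096 },
		{ AT_FLAGS,  0 },
		{ AT_ENTRY,  entry },		/* executable's entry point */
		{ AT_BASE,   base },		/* where the rtld was loaded */
		{ AT_NULL,   0 },		/* terminator */
	};
	size_t i;

	for (i = 0; i < sizeof(aux) / sizeof(aux[0]); i++)
		printf("type %ld value %#lx\n", aux[i].a_type, aux[i].a_val);
	return (0);
}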
*/ }; static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static void each_writable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *, int, void *, size_t); static void __elfN(puthdr)(struct thread *, void *, size_t *, int); static void __elfN(putnote)(void *, size_t *, const char *, int, const void *, size_t); extern int osreldate; int __elfN(coredump)(td, vp, limit) struct thread *td; struct vnode *vp; off_t limit; { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; void *hdr; size_t hdrsize; /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_writable_segment(td, cb_size_segment, &seginfo); /* * Calculate the size of the core file header area by making * a dry run of generating it. Nothing is written, but the * size is calculated. */ hdrsize = 0; __elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count); if (hdrsize + seginfo.size >= limit) return (EFAULT); /* * Allocate memory for building the header, fill it up, * and write it out. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); if (hdr == NULL) { return (EINVAL); } error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize); /* Write the contents of all of the writable segments. */ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = hdrsize; for (i = 0; i < seginfo.count; i++) { error = vn_rdwr_inchunks(UIO_WRITE, vp, (caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, UIO_USERSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, curthread); /* XXXKSE */ if (error != 0) break; offset += php->p_filesz; php++; } } free(hdr, M_TEMP); return (error); } /* * A callback for each_writable_segment() to write out the segment's * program header entry. */ static void cb_put_phdr(entry, closure) vm_map_entry_t entry; void *closure; { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = 0; if (entry->protection & VM_PROT_READ) phdr->p_flags |= PF_R; if (entry->protection & VM_PROT_WRITE) phdr->p_flags |= PF_W; if (entry->protection & VM_PROT_EXECUTE) phdr->p_flags |= PF_X; phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_writable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(entry, closure) vm_map_entry_t entry; void *closure; { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_writable_segment(td, func, closure) struct thread *td; segment_callback func; void *closure; { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; for (entry = map->header.next; entry != &map->header; entry = entry->next) { vm_object_t obj; /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. 
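Note on the core dump layout: the header size is computed by a dry run of puthdr(), and cb_put_phdr() then assigns each writable segment the next page-rounded file offset, advancing by p_filesz. The sketch below reproduces only that offset bookkeeping with invented sizes; the limit check and the actual vn_rdwr_inchunks() writes are not shown.

/*
 * Sketch of the p_offset accumulation performed while building the core
 * file's program headers.
 */
#include <stdio.h>

#define PG 4096ul
#define round_page(x) (((x) + PG - 1) & ~(PG - 1))

int
main(void)
{
	unsigned long sizes[] = { 0x2000, 0x1500, 0x800 }; /* segment sizes */
	unsigned long hdrsize = 0x640;	/* ehdr + phdrs + notes, placeholder */
	unsigned long offset = hdrsize;
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		offset = round_page(offset);	/* p_offset for this segment */
		printf("segment %zu: p_offset %#lx p_filesz %#lx\n",
		    i, offset, sizes[i]);
		offset += sizes[i];
	}
	printf("approximate core file size: %#lx\n", offset);
	return (0);
}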
*/ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((obj = entry->object.vm_object) == NULL) continue; /* Find the deepest backing object. */ while (obj->backing_object != NULL) obj = obj->backing_object; /* Ignore memory-mapped devices and such things. */ if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP && obj->type != OBJT_VNODE) continue; (*func)(entry, closure); } } /* * Write the core file header to the file, including padding up to * the page boundary. */ static int __elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize) struct thread *td; struct vnode *vp; struct ucred *cred; int numsegs; size_t hdrsize; void *hdr; { size_t off; /* Fill in the header. */ bzero(hdr, hdrsize); off = 0; __elfN(puthdr)(td, hdr, &off, numsegs); /* Write it to the core file. */ return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, td)); /* XXXKSE */ } #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; #endif static void __elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs) { struct { elf_prstatus_t status; elf_prfpregset_t fpregset; elf_prpsinfo_t psinfo; } *tempdata; elf_prstatus_t *status; elf_prfpregset_t *fpregset; elf_prpsinfo_t *psinfo; struct proc *p; struct thread *thr; size_t ehoff, noteoff, notesz, phoff; p = td->td_proc; ehoff = *off; *off += sizeof(Elf_Ehdr); phoff = *off; *off += (numsegs + 1) * sizeof(Elf_Phdr); noteoff = *off; /* * Don't allocate space for the notes if we're just calculating * the size of the header. We also don't collect the data. */ if (dst != NULL) { tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK); status = &tempdata->status; fpregset = &tempdata->fpregset; psinfo = &tempdata->psinfo; } else { tempdata = NULL; status = NULL; fpregset = NULL; psinfo = NULL; } if (dst != NULL) { psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); /* * XXX - We don't fill in the command line arguments properly * yet. */ strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); } __elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, sizeof *psinfo); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. 
*/ thr = td; while (thr != NULL) { if (dst != NULL) { status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = p->p_sig; status->pr_pid = thr->td_tid; #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 fill_regs32(thr, &status->pr_reg); fill_fpregs32(thr, fpregset); #else fill_regs(thr, &status->pr_reg); fill_fpregs(thr, fpregset); #endif } __elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status, sizeof *status); __elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset, sizeof *fpregset); /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. */ __elfN(dump_thread)(thr, dst, off); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } notesz = *off - noteoff; if (dst != NULL) free(tempdata, M_TEMP); /* Align up to a page boundary for the program segments. */ *off = round_page(*off); if (dst != NULL) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; struct phdr_closure phc; /* * Fill in the ELF header. */ ehdr = (Elf_Ehdr *)((char *)dst + ehoff); ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32 ehdr->e_machine = EM_386; #else ehdr->e_machine = ELF_ARCH; #endif ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = phoff; ehdr->e_flags = 0; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_phnum = numsegs + 1; ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shnum = 0; ehdr->e_shstrndx = SHN_UNDEF; /* * Fill in the program header entries. */ phdr = (Elf_Phdr *)((char *)dst + phoff); /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = noteoff; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = 0; phdr->p_align = 0; phdr++; /* All the writable segments from the program. */ phc.phdr = phdr; phc.offset = *off; each_writable_segment(td, cb_put_phdr, &phc); } } static void __elfN(putnote)(void *dst, size_t *off, const char *name, int type, const void *desc, size_t descsz) { Elf_Note note; note.n_namesz = strlen(name) + 1; note.n_descsz = descsz; note.n_type = type; if (dst != NULL) bcopy(¬e, (char *)dst + *off, sizeof note); *off += sizeof note; if (dst != NULL) bcopy(name, (char *)dst + *off, note.n_namesz); *off += roundup2(note.n_namesz, sizeof(Elf_Size)); if (dst != NULL) bcopy(desc, (char *)dst + *off, note.n_descsz); *off += roundup2(note.n_descsz, sizeof(Elf_Size)); } /* * Tell kern_execve.c about it, with a little help from the linker. 
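Note on __elfN(putnote)(): each note record is the fixed note header followed by the name and the descriptor, each padded up to sizeof(Elf_Size). The sketch below shows only that size arithmetic; the 8-byte alignment and the descriptor size used in main() are assumptions for the example.

/*
 * Sketch of the note record size arithmetic used when the core header is
 * sized and filled.
 */
#include <stdio.h>
#include <string.h>

#define ALIGN_SZ 8ul			/* sizeof(Elf_Size) assumption */
#define roundup2(x, y) (((x) + (y) - 1) & ~((y) - 1))

struct note_hdr { unsigned int n_namesz, n_descsz, n_type; };

static size_t
note_size(const char *name, size_t descsz)
{
	size_t namesz = strlen(name) + 1;   /* NUL included, as the kernel does */

	return (sizeof(struct note_hdr) +
	    roundup2(namesz, ALIGN_SZ) + roundup2(descsz, ALIGN_SZ));
}

int
main(void)
{
	/* a note with the "FreeBSD" vendor name; descriptor size is made up */
	printf("note occupies %zu bytes\n", note_size("FreeBSD", 0x88));
	return (0);
}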
*/ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); Index: stable/6/sys/kern/imgact_gzip.c =================================================================== --- stable/6/sys/kern/imgact_gzip.c (revision 156759) +++ stable/6/sys/kern/imgact_gzip.c (revision 156760) @@ -1,386 +1,399 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- */ /* * This module handles execution of a.out files which have been run through * "gzip". This saves diskspace, but wastes cpu-cycles and VM. * * TODO: * text-segments should be made R/O after being filled * is the vm-stuff safe ? * should handle the entire header of gzip'ed stuff. * inflate isn't quite reentrant yet... * error-handling is a mess... * so is the rest... * tidy up unnecesary includes */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct imgact_gzip { struct image_params *ip; struct exec a_out; int error; int gotheader; int where; u_char *inbuf; u_long offset; u_long output; u_long len; int idx; u_long virtual_offset, file_offset, file_end, bss_size; }; static int exec_gzip_imgact(struct image_params *imgp); static int NextByte(void *vp); static int do_aout_hdr(struct imgact_gzip *); static int Flush(void *vp, u_char *, u_long siz); static int exec_gzip_imgact(imgp) struct image_params *imgp; { int error, error2 = 0; const u_char *p = (const u_char *) imgp->image_header; struct imgact_gzip igz; struct inflate infl; struct vmspace *vmspace; /* If these four are not OK, it isn't a gzip file */ if (p[0] != 0x1f) return -1; /* 0 Simply magic */ if (p[1] != 0x8b) return -1; /* 1 Simply magic */ if (p[2] != 0x08) return -1; /* 2 Compression method */ if (p[9] != 0x03) return -1; /* 9 OS compressed on */ /* * If this one contains anything but a comment or a filename marker, * we don't want to chew on it */ if (p[3] & ~(0x18)) return ENOEXEC; /* 3 Flags */ /* These are of no use to us */ /* 4-7 Timestamp */ /* 8 Extra flags */ bzero(&igz, sizeof igz); bzero(&infl, sizeof infl); infl.gz_private = (void *) &igz; infl.gz_input = NextByte; infl.gz_output = Flush; igz.ip = imgp; igz.idx = 10; if (p[3] & 0x08) { /* skip a filename */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } if (p[3] & 0x10) { /* skip a comment */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } igz.len = imgp->attr->va_size; error = inflate(&infl); /* * The unzipped file may not even have been long enough to contain * a header giving Flush() a chance to return error. Check for this. 
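Note on the gzip activator that follows: before inflating, exec_gzip_imgact() screens the gzip header bytes (magic, deflate method, OS byte) and tolerates only the optional NAME and COMMENT fields, skipping past them to find where the compressed stream starts. The sketch below reproduces only that screening on a hand-made buffer.

/*
 * Sketch of the gzip header checks and optional-field skipping.
 */
#include <stddef.h>
#include <stdio.h>

static int
gzip_payload_offset(const unsigned char *p, size_t len, size_t *idx)
{
	size_t i = 10;			/* fixed-size part of the header */

	if (len < 10 || p[0] != 0x1f || p[1] != 0x8b)	/* magic */
		return (-1);
	if (p[2] != 0x08)		/* compression method: deflate */
		return (-1);
	if (p[9] != 0x03)		/* OS the file was compressed on */
		return (-1);
	if (p[3] & ~0x18)		/* only FNAME (0x08) / FCOMMENT (0x10) */
		return (-1);
	if (p[3] & 0x08)		/* skip the NUL-terminated file name */
		while (i < len && p[i++] != '\0')
			;
	if (p[3] & 0x10)		/* skip the NUL-terminated comment */
		while (i < len && p[i++] != '\0')
			;
	if (i >= len)
		return (-1);
	*idx = i;
	return (0);
}

int
main(void)
{
	unsigned char hdr[] = { 0x1f, 0x8b, 0x08, 0x08, 0, 0, 0, 0, 0, 0x03,
	    'a', '.', 'o', 'u', 't', '\0', 0x78 /* first payload byte */ };
	size_t idx;

	if (gzip_payload_offset(hdr, sizeof(hdr), &idx) == 0)
		printf("compressed stream starts at offset %zu\n", idx);
	return (0);
}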
*/ if ( !igz.gotheader ) return ENOEXEC; if ( !error ) { vmspace = imgp->proc->p_vmspace; error = vm_map_protect(&vmspace->vm_map, (vm_offset_t) vmspace->vm_taddr, (vm_offset_t) (vmspace->vm_taddr + (vmspace->vm_tsize << PAGE_SHIFT)) , VM_PROT_READ|VM_PROT_EXECUTE,0); } if (igz.inbuf) { error2 = vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, (vm_offset_t) igz.inbuf + PAGE_SIZE); } if (igz.error || error || error2) { printf("Output=%lu ", igz.output); printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", error, igz.error, error2, igz.where); } if (igz.error) return igz.error; if (error) return ENOEXEC; if (error2) return error2; return 0; } static int do_aout_hdr(struct imgact_gzip * gz) { int error; + struct thread *td = curthread; struct vmspace *vmspace; vm_offset_t vmaddr; /* * Set file/virtual offset based on a.out variant. We do two cases: * host byte order and network byte order (for NetBSD compatibility) */ switch ((int) (gz->a_out.a_magic & 0xffff)) { case ZMAGIC: gz->virtual_offset = 0; if (gz->a_out.a_text) { gz->file_offset = PAGE_SIZE; } else { /* Bill's "screwball mode" */ gz->file_offset = 0; } break; case QMAGIC: gz->virtual_offset = PAGE_SIZE; gz->file_offset = 0; break; default: /* NetBSD compatibility */ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: gz->virtual_offset = PAGE_SIZE; gz->file_offset = 0; break; default: gz->where = __LINE__; return (-1); } } gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); /* * Check various fields in header for validity/bounds. */ if ( /* entry point must lay with text region */ gz->a_out.a_entry < gz->virtual_offset || gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || /* text and data size must each be page rounded */ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { gz->where = __LINE__; return (-1); } /* * text/data/bss must not exceed limits */ PROC_LOCK(gz->ip->proc); if ( /* text can't exceed maximum text size */ gz->a_out.a_text > maxtsiz || /* data + bss can't exceed rlimit */ gz->a_out.a_data + gz->bss_size > lim_cur(gz->ip->proc, RLIMIT_DATA)) { PROC_UNLOCK(gz->ip->proc); gz->where = __LINE__; return (ENOMEM); } PROC_UNLOCK(gz->ip->proc); /* Find out how far we should go */ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; /* + * Avoid a possible deadlock if the current address space is destroyed + * and that address space maps the locked vnode. In the common case, + * the locked vnode's v_usecount is decremented but remains greater + * than zero. Consequently, the vnode lock is not needed by vrele(). + * However, in cases where the vnode lock is external, such as nullfs, + * v_usecount may become zero. + */ + VOP_UNLOCK(gz->ip->vp, 0, td); + + /* * Destroy old process VM and create a new one (with a new stack) */ exec_new_vmspace(gz->ip, &aout_sysvec); + + vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY, td); vmspace = gz->ip->proc->p_vmspace; vmaddr = gz->virtual_offset; error = vm_mmap(&vmspace->vm_map, &vmaddr, gz->a_out.a_text + gz->a_out.a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, OBJT_DEFAULT, NULL, 0); if (error) { gz->where = __LINE__; return (error); } if (gz->bss_size != 0) { /* * Allocate demand-zeroed area for uninitialized data. * "bss" = 'block started by symbol' - named after the * IBM 7090 instruction of the same name. 
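Note on the locking change this commit applies in all three activators (visible again in do_aout_hdr() below): the vnode lock is dropped across exec_new_vmspace() because destroying the old address space can need that same lock, for example when vrele() of a nullfs-backed vnode drops v_usecount to zero, and the lock is reacquired before the vnode is used again. The sketch below is purely a userland analogue of that ordering: a pthread mutex stands in for the vnode lock, teardown() for exec_new_vmspace(), and the scenario is invented to show the unlock/relock discipline, not the kernel interfaces.

/*
 * Userland analogue of: VOP_UNLOCK(); exec_new_vmspace(); vn_lock().
 * Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;

static void
teardown(void)
{
	/* May need object_lock internally, so the caller must not hold it. */
	pthread_mutex_lock(&object_lock);
	puts("teardown: acquired object_lock without deadlocking");
	pthread_mutex_unlock(&object_lock);
}

static void
activator(void)
{
	pthread_mutex_lock(&object_lock);	/* lock held on entry */

	/* ... header parsing that needs the locked object ... */

	pthread_mutex_unlock(&object_lock);	/* VOP_UNLOCK() in the diff */
	teardown();				/* exec_new_vmspace() */
	pthread_mutex_lock(&object_lock);	/* vn_lock(LK_EXCLUSIVE|LK_RETRY) */

	/* ... map the segments from the still-referenced object ... */

	pthread_mutex_unlock(&object_lock);
}

int
main(void)
{
	activator();
	return (0);
}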
*/ vmaddr = gz->virtual_offset + gz->a_out.a_text + gz->a_out.a_data; error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, gz->bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { gz->where = __LINE__; return (error); } } /* Fill in process VM information */ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; vmspace->vm_daddr = (caddr_t) (uintptr_t) (gz->virtual_offset + gz->a_out.a_text); /* Fill in image_params */ gz->ip->interpreted = 0; gz->ip->entry_addr = gz->a_out.a_entry; gz->ip->proc->p_sysent = &aout_sysvec; return 0; } static int NextByte(void *vp) { int error; struct imgact_gzip *igz = (struct imgact_gzip *) vp; if (igz->idx >= igz->len) { igz->where = __LINE__; return GZ_EOF; } if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { return igz->inbuf[(igz->idx++) - igz->offset]; } if (igz->inbuf) { error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, (vm_offset_t) igz->inbuf + PAGE_SIZE); if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } } igz->offset = igz->idx & ~PAGE_MASK; error = vm_mmap(kernel_map, /* map */ (vm_offset_t *) & igz->inbuf, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ OBJT_VNODE, /* handle type */ igz->ip->vp, /* vnode */ igz->offset); /* offset */ if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } return igz->inbuf[(igz->idx++) - igz->offset]; } static int Flush(void *vp, u_char * ptr, u_long siz) { struct imgact_gzip *gz = (struct imgact_gzip *) vp; u_char *p = ptr, *q; int i; /* First, find an a.out-header. */ if (gz->output < sizeof gz->a_out) { q = (u_char *) & gz->a_out; i = min(siz, sizeof gz->a_out - gz->output); bcopy(p, q + gz->output, i); gz->output += i; p += i; siz -= i; if (gz->output == sizeof gz->a_out) { gz->gotheader = 1; i = do_aout_hdr(gz); if (i == -1) { if (!gz->where) gz->where = __LINE__; gz->error = ENOEXEC; return ENOEXEC; } else if (i) { gz->where = __LINE__; gz->error = i; return ENOEXEC; } if (gz->file_offset == 0) { q = (u_char *) (uintptr_t) gz->virtual_offset; copyout(&gz->a_out, q, sizeof gz->a_out); } } } /* Skip over zero-padded first PAGE if needed */ if (gz->output < gz->file_offset && gz->output + siz > gz->file_offset) { i = min(siz, gz->file_offset - gz->output); gz->output += i; p += i; siz -= i; } if (gz->output >= gz->file_offset && gz->output < gz->file_end) { i = min(siz, gz->file_end - gz->output); q = (u_char *) (uintptr_t) (gz->virtual_offset + gz->output - gz->file_offset); copyout(p, q, i); gz->output += i; p += i; siz -= i; } gz->output += siz; return 0; } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; EXEC_SET(execgzip, gzip_execsw); Index: stable/6/sys/vm/vm_extern.h =================================================================== --- stable/6/sys/vm/vm_extern.h (revision 156759) +++ stable/6/sys/vm/vm_extern.h (revision 156760) @@ -1,96 +1,98 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 * $FreeBSD$ */ #ifndef _VM_EXTERN_H_ #define _VM_EXTERN_H_ struct buf; struct proc; struct vmspace; struct vmtotal; struct mount; struct vnode; #ifdef _KERNEL #ifdef TYPEDEF_FOR_UAP int getpagesize(struct thread *, void *, int *); int madvise(struct thread *, void *, int *); int mincore(struct thread *, void *, int *); int mprotect(struct thread *, void *, int *); int msync(struct thread *, void *, int *); int munmap(struct thread *, void *, int *); int obreak(struct thread *, void *, int *); int sbrk(struct thread *, void *, int *); int sstk(struct thread *, void *, int *); int swapon(struct thread *, void *, int *); #endif /* TYPEDEF_FOR_UAP */ int kernacc(void *, int, int); vm_offset_t kmem_alloc(vm_map_t, vm_size_t); vm_offset_t kmem_alloc_nofault(vm_map_t, vm_size_t); vm_offset_t kmem_alloc_wait(vm_map_t, vm_size_t); void kmem_free(vm_map_t, vm_offset_t, vm_size_t); void kmem_free_wakeup(vm_map_t, vm_offset_t, vm_size_t); void kmem_init(vm_offset_t, vm_offset_t); vm_offset_t kmem_malloc(vm_map_t, vm_size_t, boolean_t); vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t); void swapout_procs(int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t); void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); void vm_forkproc(struct thread *, struct proc *, struct thread *, int); void vm_waitproc(struct proc *); int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t); void vm_set_page_size(void); struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t); struct vmspace *vmspace_fork(struct vmspace *); void vmspace_exec(struct proc *, vm_offset_t, vm_offset_t); void vmspace_unshare(struct proc *); void vmspace_free(struct vmspace *); void vmspace_exitfree(struct proc *); void vnode_pager_setsize(struct vnode *, vm_ooffset_t); int vslock(void *, size_t); void vsunlock(void *, size_t); void vm_object_print(/* db_expr_t */ long, boolean_t, /* db_expr_t */ long, char *); int vm_fault_quick(caddr_t v, int prot); +struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset); +void vm_imgact_unmap_page(struct sf_buf *sf); void 
vm_thread_dispose(struct thread *td); void vm_thread_dispose_altkstack(struct thread *td); void vm_thread_new(struct thread *td, int pages); void vm_thread_new_altkstack(struct thread *td, int pages); void vm_thread_swapin(struct thread *td); void vm_thread_swapout(struct thread *td); #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ Index: stable/6/sys/vm/vm_glue.c =================================================================== --- stable/6/sys/vm/vm_glue.c (revision 156759) +++ stable/6/sys/vm/vm_glue.c (revision 156760) @@ -1,947 +1,1019 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int maxslp; /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits(void *); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. */ static void scheduler(void *); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL) #ifndef NO_SWAPPING static void swapout(struct proc *); #endif static volatile int proc0_rescan; /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); if ((vm_offset_t)addr + len > kernel_map->max_offset || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjuction with this call. */ int useracc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); PROC_LOCK(curproc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) > lim_cur(curproc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(curproc); return (ENOMEM); } PROC_UNLOCK(curproc); #if 0 /* * XXX - not yet * * The limit for transient usage of wired pages should be * larger than for "permanent" wired pages (mlock()). * * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. 
*/ if (npages + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (error == KERN_SUCCESS ? 0 : EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); +} + +/* + * Pin the page contained within the given object at the given offset. If the + * page is not resident, allocate and load it using the given object's pager. + * Return the pinned page if successful; otherwise, return NULL. + */ +static vm_page_t +vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) +{ + vm_page_t m, ma[1]; + vm_pindex_t pindex; + int rv; + + VM_OBJECT_LOCK(object); + pindex = OFF_TO_IDX(offset); + m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + ma[0] = m; + rv = vm_pager_get_pages(object, ma, 1, 0); + m = vm_page_lookup(object, pindex); + if (m == NULL) + goto out; + if (m->valid == 0 || rv != VM_PAGER_OK) { + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); + m = NULL; + goto out; + } + } + vm_page_lock_queues(); + vm_page_hold(m); + vm_page_wakeup(m); + vm_page_unlock_queues(); +out: + VM_OBJECT_UNLOCK(object); + return (m); +} + +/* + * Return a CPU private mapping to the page at the given offset within the + * given object. The page is pinned before it is mapped. + */ +struct sf_buf * +vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset) +{ + vm_page_t m; + + m = vm_imgact_hold_page(object, offset); + if (m == NULL) + return (NULL); + sched_pin(); + return (sf_buf_alloc(m, SFB_CPUPRIVATE)); +} + +/* + * Destroy the given CPU private mapping and unpin the page that it mapped. + */ +void +vm_imgact_unmap_page(struct sf_buf *sf) +{ + vm_page_t m; + + m = sf_buf_page(sf); + sf_buf_free(sf); + sched_unpin(); + vm_page_lock_queues(); + vm_page_unhold(m); + vm_page_unlock_queues(); } #ifndef KSTACK_MAX_PAGES #define KSTACK_MAX_PAGES 32 #endif /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ void vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i; /* Bounds check */ if (pages <= 1) pages = KSTACK_PAGES; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); td->td_kstack_obj = ksobj; /* * Get a kernel virtual address for this thread's kstack. */ ks = kmem_alloc_nofault(kernel_map, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); if (ks == 0) panic("vm_thread_new: kstack allocation failed"); if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } td->td_kstack = ks; /* * Knowing the number of pages allocated is useful when you * want to deallocate them. */ td->td_kstack_pages = pages; /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { /* * Get a kernel stack page. 
*/ m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED); ma[i] = m; m->valid = VM_PAGE_BITS_ALL; } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(ks, ma, pages); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m; int i, pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; pmap_qremove(ks, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock_queues(); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); vm_object_deallocate(ksobj); kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Allow a thread's kernel stack to be paged out. */ void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_lock_queues(); vm_page_dirty(m); vm_page_unwire(m, 0); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. */ void vm_thread_swapin(struct thread *td) { vm_object_t ksobj; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(ksobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid); m = vm_page_lookup(ksobj, i); m->valid = VM_PAGE_BITS_ALL; } ma[i] = m; vm_page_lock_queues(); vm_page_wire(m); vm_page_wakeup(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } /* * Set up a variable-sized alternate kstack. */ void vm_thread_new_altkstack(struct thread *td, int pages) { td->td_altkstack = td->td_kstack; td->td_altkstack_obj = td->td_kstack_obj; td->td_altkstack_pages = td->td_kstack_pages; vm_thread_new(td, pages); } /* * Restore the original kstack. */ void vm_thread_dispose_altkstack(struct thread *td) { vm_thread_dispose(td); td->td_kstack = td->td_altkstack; td->td_kstack_obj = td->td_altkstack_obj; td->td_kstack_pages = td->td_altkstack_pages; td->td_altkstack = 0; td->td_altkstack_obj = NULL; td->td_altkstack_pages = 0; } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ void vm_forkproc(td, p2, td2, flags) struct thread *td; struct proc *p2; struct thread *td2; int flags; { struct proc *p1 = td->td_proc; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. 
*/ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { vmspace_unshare(p1); } } cpu_fork(td, p2, td2, flags); return; } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); } /* * Called after process has been wait(2)'ed apon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { vmspace_exitfree(p); /* and clean-out the vmspace */ } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { struct proc *p = udata; struct plimit *limp; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ limp = p->p_limit; limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } void faultin(p) struct proc *p; { #ifdef NO_SWAPPING PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_sflag & PS_INMEM) == 0) panic("faultin: proc swapped out with NO_SWAPPING!"); #else /* !NO_SWAPPING */ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_sflag & PS_SWAPPINGIN) msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0); else if ((p->p_sflag & PS_INMEM) == 0) { /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; mtx_lock_spin(&sched_lock); p->p_sflag |= PS_SWAPPINGIN; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPINGIN; p->p_sflag |= PS_INMEM; FOREACH_THREAD_IN_PROC(p, td) { TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); } mtx_unlock_spin(&sched_lock); wakeup(&p->p_sflag); /* Allow other threads to swap p out now. */ --p->p_lock; } #endif /* NO_SWAPPING */ } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * * XXXKSE - process with the thread with highest priority counts.. * * Giant is held on entry. 
*/ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { struct proc *p; struct thread *td; int pri; struct proc *pp; int ppri; mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); loop: if (vm_page_count_min()) { VM_WAIT; mtx_lock_spin(&sched_lock); proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } pp = NULL; ppri = INT_MIN; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct ksegrp *kg; if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { continue; } mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ if (td->td_inhibitors == TDI_SWAPPED) { kg = td->td_ksegrp; pri = p->p_swtime + kg->kg_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { pri -= p->p_nice * 8; } /* * if this ksegrp is higher priority * and there is enough space, then select * this process instead of the previous * selection. */ if (pri > ppri) { pp = p; ppri = pri; } } } mtx_unlock_spin(&sched_lock); } sx_sunlock(&allproc_lock); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { mtx_lock_spin(&sched_lock); if (!proc0_rescan) { TD_SET_IWAIT(&thread0); mi_switch(SW_VOL, NULL); } proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } PROC_LOCK(p); /* * Another process may be bringing or may have already * brought this process in while we traverse all threads. * Or, this process may even be being swapped out again. */ if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPINREQ; mtx_unlock_spin(&sched_lock); /* * We would like to bring someone in. (only if there is space). * [What checks the space? ] */ faultin(p); PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); p->p_swtime = 0; proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } void kick_proc0(void) { struct thread *td = &thread0; if (TD_AWAITING_INTR(td)) { CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, 0); TD_CLR_IWAIT(td); setrunqueue(td, SRQ_INTR); } else { proc0_rescan = 1; CTR2(KTR_INTR, "%s: state %d", __func__, td->td_state); } } #ifndef NO_SWAPPING /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, "Time before a process will be swapped out"); /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. */ void swapout_procs(action) int action; { struct proc *p; struct thread *td; struct ksegrp *kg; int didswap = 0; retry: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct vmspace *vm; int minslptime = 100000; /* * Watch out for a process in * creation. It may have no * address space or lock yet. 
*/ mtx_lock_spin(&sched_lock); if (p->p_state == PRS_NEW) { mtx_unlock_spin(&sched_lock); continue; } mtx_unlock_spin(&sched_lock); /* * An aio daemon switches its * address space while running. * Perform a quick check whether * a process has P_SYSTEM. */ if ((p->p_flag & P_SYSTEM) != 0) continue; /* * Do not swapout a process that * is waiting for VM data * structures as there is a possible * deadlock. Test this first as * this may block. * * Lock the map until swapout * finishes, or a thread of this * process may attempt to alter * the map. */ PROC_LOCK(p); vm = p->p_vmspace; KASSERT(vm != NULL, ("swapout_procs: a process has no address space")); atomic_add_int(&vm->vm_refcnt, 1); PROC_UNLOCK(p); if (!vm_map_trylock(&vm->vm_map)) goto nextproc1; PROC_LOCK(p); if (p->p_lock != 0 || (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT) ) != 0) { goto nextproc2; } /* * only aiod changes vmspace, however it will be * skipped because of the if statement above checking * for P_SYSTEM */ if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM) goto nextproc2; switch (p->p_state) { default: /* Don't swap out processes in any sort * of 'special' state. */ break; case PRS_NORMAL: mtx_lock_spin(&sched_lock); /* * do not swapout a realtime process * Check all the thread groups.. */ FOREACH_KSEGRP_IN_PROC(p, kg) { if (PRI_IS_REALTIME(kg->kg_pri_class)) goto nextproc; /* * Guarantee swap_idle_threshold1 * time in memory. */ if (kg->kg_slptime < swap_idle_threshold1) goto nextproc; /* * Do not swapout a process if it is * waiting on a critical event of some * kind or there is a thread whose * pageable memory may be accessed. * * This could be refined to support * swapping out a thread. */ FOREACH_THREAD_IN_GROUP(kg, td) { if ((td->td_priority) < PSOCK || !thread_safetoswapout(td)) goto nextproc; } /* * If the system is under memory stress, * or if we are swapping * idle processes >= swap_idle_threshold2, * then swap the process out. */ if (((action & VM_SWAP_NORMAL) == 0) && (((action & VM_SWAP_IDLE) == 0) || (kg->kg_slptime < swap_idle_threshold2))) goto nextproc; if (minslptime > kg->kg_slptime) minslptime = kg->kg_slptime; } /* * If the pageout daemon didn't free enough pages, * or if this process is idle and the system is * configured to swap proactively, swap it out. */ if ((action & VM_SWAP_NORMAL) || ((action & VM_SWAP_IDLE) && (minslptime > swap_idle_threshold2))) { swapout(p); didswap++; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); sx_sunlock(&allproc_lock); goto retry; } nextproc: mtx_unlock_spin(&sched_lock); } nextproc2: PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); nextproc1: vmspace_free(vm); continue; } sx_sunlock(&allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapout(p) struct proc *p; { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif /* * The states of this process and its threads may have changed * by now. Assuming that there is only one pageout daemon thread, * this process should still be in memory. */ KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM, ("swapout: lost a swapout race?")); #if defined(INVARIANTS) /* * Make sure that all threads are safe to be swapped out. * * Alternatively, we could swap out only safe threads. 
*/ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(thread_safetoswapout(td), ("swapout: there is a thread not safe for swapout")); } #endif /* INVARIANTS */ ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPINGOUT; PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) TD_SET_SWAPPED(td); mtx_unlock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPINGOUT; p->p_swtime = 0; } #endif /* !NO_SWAPPING */
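[Editor's note: the following sketch is illustrative commentary, not part of the committed patch.] The vm_imgact_map_page()/vm_imgact_unmap_page() pair added to vm_glue.c and declared in vm_extern.h gives the image activators a way to read pages of an executable directly through its VM object: the page is held (not wired), mapped through a CPU-private sf_buf, and released once the caller is finished with it. A minimal sketch of a caller follows, assuming a hypothetical helper name (imgact_copy_from_object), destination buffer, and EIO error choice; the real callers are in the imgact_*.c changes elsewhere in this patch.

/*
 * Illustrative sketch only (not committed code): copy "len" bytes
 * beginning at byte offset "offset" of an executable's VM object
 * into the kernel buffer "dst", one page at a time, using the
 * vm_imgact_map_page()/vm_imgact_unmap_page() helpers.  A real
 * version would need <sys/param.h>, <sys/systm.h>, <sys/sf_buf.h>,
 * <vm/vm.h> and <vm/vm_extern.h>.
 */
static int
imgact_copy_from_object(vm_object_t object, vm_ooffset_t offset,
    void *dst, size_t len)
{
	struct sf_buf *sf;
	vm_ooffset_t off;
	size_t chunk, done;

	for (done = 0; done < len; done += chunk) {
		off = offset + done;
		/* Hold and map the page containing "off". */
		sf = vm_imgact_map_page(object, off);
		if (sf == NULL)
			return (EIO);	/* error code is an assumption */
		/* Copy no further than the end of this page. */
		chunk = PAGE_SIZE - (off & PAGE_MASK);
		if (chunk > len - done)
			chunk = len - done;
		bcopy((char *)sf_buf_kva(sf) + (off & PAGE_MASK),
		    (char *)dst + done, chunk);
		/* Drop the CPU-private mapping and unhold the page. */
		vm_imgact_unmap_page(sf);
	}
	return (0);
}

Because vm_imgact_map_page() allocates the sf_buf with SFB_CPUPRIVATE and calls sched_pin(), the returned kernel virtual address is valid only on the current CPU and only until the matching vm_imgact_unmap_page(), which unpins the thread and unholds the page; a caller should therefore keep the window between map and unmap short and copy the data out before releasing the mapping.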