Index: stand/common/load_elf_obj.c
===================================================================
--- stand/common/load_elf_obj.c
+++ stand/common/load_elf_obj.c
@@ -40,6 +40,8 @@
 #define FREEBSD_ELF
 #include <sys/link_elf.h>
 
+#include <vm/vm.h>
+
 #include "bootstrap.h"
 
 #define COPYOUT(s,d,l)	archsw.arch_copyout((vm_offset_t)(s), d, l)
@@ -191,6 +193,7 @@
 	Elf_Ehdr *hdr;
 	Elf_Shdr *shdr, *cshdr, *lshdr;
 	vm_offset_t firstaddr, lastaddr;
+	vm_prot_t lastprot, prot;
 	int i, nsym, res, ret, shdrbytes, symstrindex;
 
 	ret = 0;
@@ -215,6 +218,7 @@
 	 */
 	for (i = 0; i < hdr->e_shnum; i++)
 		shdr[i].sh_addr = 0;
+	lastprot = VM_PROT_NONE;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_size == 0)
 			continue;
@@ -226,6 +230,11 @@
 #endif
 		if ((shdr[i].sh_flags & SHF_ALLOC) == 0)
 			break;
+		prot = link_elf_flags_to_prot(&shdr[i]);
+		if (prot != lastprot) {
+			lastprot = prot;
+			lastaddr = roundup2(lastaddr, PAGE_SIZE);
+		}
 		lastaddr = roundup(lastaddr, shdr[i].sh_addralign);
 		shdr[i].sh_addr = (Elf_Addr)lastaddr;
 		lastaddr += shdr[i].sh_size;
@@ -290,6 +299,9 @@
 			break;
 		}
 	}
+
+	/* End on a page boundary. */
+	lastaddr = roundup2(lastaddr, PAGE_SIZE);
 
 	/* Clear the whole area, including bss regions. */
 	kern_bzero(firstaddr, lastaddr - firstaddr);
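The loader-side rule above is easiest to see in isolation: whenever the protection implied by the next SHF_ALLOC section differs from that of the previous one, the running address is bumped to the next page boundary, and the whole allocation ends on a page boundary. The following is a standalone sketch of that rule, not loader code: the PAGE_SIZE value, the protection encoding, the section list, and the ROUNDUP2() helper are made-up stand-ins for the target's page size, the vm_prot_t values, the ELF section headers, and roundup2().

/*
 * Standalone sketch of the sizing rule in the loader change above:
 * start a fresh page whenever the protection of the next section
 * differs from the previous one, then end on a page boundary.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096			/* assumed page size */
#define ROUNDUP2(x, y)	(((x) + ((y) - 1)) & ~((uintptr_t)(y) - 1))

struct section {
	const char *name;
	uintptr_t size;
	uintptr_t align;
	unsigned prot;				/* stand-in for vm_prot_t */
};

int
main(void)
{
	/* Hypothetical layout: text, rodata, data, bss. */
	struct section s[] = {
		{ ".text",   0x3120, 16, 0x5 /* R+X */ },
		{ ".rodata", 0x0480,  8, 0x1 /* R   */ },
		{ ".data",   0x0200,  8, 0x3 /* RW  */ },
		{ ".bss",    0x0c00, 32, 0x3 /* RW  */ },
	};
	uintptr_t lastaddr = 0x1000;		/* arbitrary load offset */
	unsigned lastprot = 0;			/* VM_PROT_NONE analogue */
	size_t i;

	for (i = 0; i < sizeof(s) / sizeof(s[0]); i++) {
		/* New protection: move to the next page boundary. */
		if (s[i].prot != lastprot) {
			lastprot = s[i].prot;
			lastaddr = ROUNDUP2(lastaddr, PAGE_SIZE);
		}
		/* Then respect the section's own alignment. */
		lastaddr = ROUNDUP2(lastaddr, s[i].align);
		printf("%-8s at 0x%06lx size 0x%04lx prot 0x%x\n",
		    s[i].name, (unsigned long)lastaddr,
		    (unsigned long)s[i].size, s[i].prot);
		lastaddr += s[i].size;
	}
	/* End on a page boundary, as the patch does. */
	lastaddr = ROUNDUP2(lastaddr, PAGE_SIZE);
	printf("total size 0x%lx\n", (unsigned long)(lastaddr - 0x1000));
	return (0);
}

With these inputs, .rodata and .data each start on a fresh page while .bss shares a page with .data because the protections match, which is the layout the kernel-side code later depends on when it applies per-page protections.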
Index: sys/kern/link_elf_obj.c
===================================================================
--- sys/kern/link_elf_obj.c
+++ sys/kern/link_elf_obj.c
@@ -196,6 +196,31 @@
 
 SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, NULL);
 
+static void
+link_elf_set_prot(vm_offset_t start, vm_offset_t end, vm_prot_t prot,
+    const char *filename, bool vm_map)
+{
+
+	/*
+	 * Only call the protection functions if there is something
+	 * to change.
+	 */
+	if (start < end && prot != VM_PROT_ALL) {
+#ifdef DEBUG_ELF_LINKING
+		printf("%s: %p/%lx (%lx)\n", __func__,
+		    (void *)start, (u_long)(end - start), (u_long)prot);
+#endif
+		if (vm_map &&
+		    vm_map_protect(kernel_map, start, end, prot, true) !=
+		    KERN_SUCCESS)
+			printf("%s: unable to protect %lx bytes at 0x%lx\n",
+			    filename != NULL ? filename : "",
+			    (u_long)(end - start), (u_long)start);
+		else if (!vm_map)
+			pmap_protect(kernel_pmap, start, end, prot);
+	}
+}
+
 static int
 link_elf_link_preload(linker_class_t cls, const char *filename,
     linker_file_t *result)
@@ -240,6 +265,9 @@
 	ef->address = *(caddr_t *)baseptr;
 	lf->address = *(caddr_t *)baseptr;
 	lf->size = *(size_t *)sizeptr;
+#ifdef DEBUG_ELF_LINKING
+	printf("%s: %p/%lx\n", filename, (void *)lf->address, (u_long)lf->size);
+#endif
 
 	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
 	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
@@ -362,6 +390,12 @@
 		if (ef->shstrtab && shdr[i].sh_name != 0)
 			ef->progtab[pb].name =
 			    ef->shstrtab + shdr[i].sh_name;
+#ifdef DEBUG_ELF_LINKING
+		printf("%s: %d: %s: %p/%lx (%lx)\n", filename, pb,
+		    ef->progtab[pb].name, ef->progtab[pb].addr,
+		    (u_long)ef->progtab[pb].size,
+		    (u_long)link_elf_flags_to_prot(&shdr[i]));
+#endif
 		if (ef->progtab[pb].name != NULL &&
 		    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) {
 			void *dpcpu;
@@ -474,13 +508,228 @@
 link_elf_link_preload_finish(linker_file_t lf)
 {
 	elf_file_t ef;
-	int error;
+	vm_offset_t protend, protstart, rangeend, vaddr;
+	vm_prot_t lastprot, prot;
+	int error, i;
 
 	ef = (elf_file_t)lf;
 	error = relocate_file(ef);
 	if (error)
 		return error;
 
+	KASSERT(ef->progtab != NULL && ef->nprogtab > 0,
+	    ("%s: Invalid progtab (ef=%p)", __func__, ef));
+	if (ef->progtab == NULL || ef->nprogtab == 0)
+		return (EINVAL);
+
+	/*
+	 * Set protections for the kernel memory allocated by the loader.
+	 * Note that this is slightly tricky because we can't guarantee
+	 * that the loader did what we "expected" (namely, that sections
+	 * with different permissions do not share a page).  Therefore,
+	 * we need to deal with the possibility that pages may hold
+	 * multiple sections with different permission requirements, and
+	 * we need to use the union of the necessary permissions.
+	 *
+	 * Note that we will assume read-only permissions for DPCPU and
+	 * VNET sections.  The data doesn't actually need to be mapped
+	 * at this point, but there is no point in setting VM_PROT_NONE
+	 * unless/until we actually free the underlying pages.  In
+	 * the meantime, using VM_PROT_READ for those sections may allow
+	 * the loader to coalesce them with adjacent read-only sections.
+	 *
+	 * One final note: code here should be kept in sync with the
+	 * loader code (currently in stand/common/load_elf_obj.c).
+	 * This code should be able to deal with variations from what
+	 * it expects, but it is best if the two parts of the system
+	 * stay aligned.
+	 */
+	protend = (vm_offset_t)lf->address;
+	protstart = trunc_page(protend);
+	lastprot = (protend & PAGE_MASK) == 0 ? VM_PROT_NONE : VM_PROT_ALL;
+	rangeend = protend + lf->size;
+	for (i = 0; i < ef->nprogtab; i++) {
+		if (ef->progtab[i].size == 0)
+			continue;
+
+		/* Skip VNET and DPCPU sections. */
+		if (ef->progtab[i].name != NULL && (
+#ifdef VIMAGE
+		    !strcmp(ef->progtab[i].name, VNET_SETNAME) ||
+#endif
+		    !strcmp(ef->progtab[i].name, DPCPU_SETNAME)))
+			continue;
+
+		/* Verify the allocation is within the range we expected. */
+		vaddr = (vm_offset_t)ef->progtab[i].addr;
+		KASSERT(vaddr >= protend &&
+		    vaddr + ef->progtab[i].size <= rangeend,
+		    ("%s: progtab[%d].addr (%p, %lu) outside range "
+		    "(%p-%p) (ef=%p)", __func__, i, ef->progtab[i].addr,
+		    (u_long)ef->progtab[i].size, (void *)protend,
+		    (void *)rangeend, ef));
+
+		/* Determine protections for this section. */
+		prot = link_elf_flags_to_prot(
+		    &ef->e_shdr[ef->progtab[i].sec]);
+
+		/*
+		 * Find "holes" (such as those caused by VNET or DPCPU
+		 * sections) and protect them as VM_PROT_READ.
+		 * Note that we don't need to do anything special here
+		 * if the previous range was VM_PROT_READ.  Instead, the
+		 * range will get auto-extended below.
+		 */
+		if (lastprot != VM_PROT_READ &&
+		    round_page(protend) < trunc_page(vaddr)) {
+			KASSERT((protstart & PAGE_MASK) == 0,
+			    ("%s: protstart (%p) is not page-aligned "
+			    "(filling hole)", __func__, (void *)protstart));
+
+			/* Protect previous section. */
+			protend = round_page(protend);
+			link_elf_set_prot(protstart, protend, lastprot,
+			    lf->filename, false);
+
+			/*
+			 * Record the VM_PROT_READ "hole".  If one exists,
+			 * we'll either merge it into the next section or
+			 * protect it separately below.  If there is no
+			 * longer a "hole", the code below will functionally
+			 * discard it.
+			 */
+			protstart = protend;
+			protend = trunc_page(vaddr);
+			lastprot = VM_PROT_READ;
+		}
+
+		/*
+		 * If the protections are different from the last section,
+		 * set the protections appropriately.
+		 *
+		 * Store the current protections in lastprot and the start
+		 * and end of the range covered by that protection in
+		 * protstart and protend.
+		 */
+		KASSERT((protstart & PAGE_MASK) == 0,
+		    ("%s: protstart (%p) is not page-aligned", __func__,
+		    (void *)protstart));
+		KASSERT(protstart <= protend,
+		    ("%s: protstart > protend (%p > %p)",
+		    __func__, (void *)protstart, (void *)protend));
+		KASSERT(lastprot == VM_PROT_READ ||
+		    trunc_page(vaddr) == trunc_page(protend) ||
+		    trunc_page(vaddr) == round_page(protend),
+		    ("%s: Unexpected hole (protstart=%p, protend=%p, vaddr=%p)",
+		    __func__, (void *)protstart, (void *)protend,
+		    (void *)vaddr));
+
+		/* If the protection is the same, extend the existing range. */
+		if (prot == lastprot) {
+			protend = vaddr + ef->progtab[i].size;
+			continue;
+		}
+
+		/*
+		 * Do protend and the current range share a page?
+		 * If so, we can only change the protections on
+		 * the pages that protstart - protend completely
+		 * use.  For the page which the previous protection
+		 * area shares with the current range, we need to
+		 * use the union of the needed permissions.  If the
+		 * new range extends beyond the page shared with the
+		 * previous range, the new range can use the correct
+		 * permissions.
+		 *
+		 * So, this leads to up to three ranges:
+		 * 1. The portion of the previous range which is in
+		 *    unique pages.
+		 * 2. A page shared by the previous range and the current
+		 *    range.
+		 * 3. The portion of the current range which is in unique
+		 *    pages.
+		 *
+		 * Note that not all of the above three ranges need
+		 * to exist.
+		 */
+		KASSERT(vaddr >= protend, ("%s: vaddr < protend (%p < %p)",
+		    __func__, (void *)vaddr, (void *)protend));
+		if ((protend & PAGE_MASK) != 0 &&
+		    trunc_page(protend) == trunc_page(vaddr)) {
+			/* Deal with range 1 (see comment). */
+			if (protstart < trunc_page(protend)) {
+				link_elf_set_prot(protstart,
+				    trunc_page(protend), lastprot,
+				    lf->filename, false);
+				protstart = trunc_page(protend);
+			}
+
+			/* Deal with range 2 (see comment). */
+			KASSERT(protend - protstart < PAGE_SIZE,
+			    ("%s: Unexpectedly large remaining range after "
+			    "trimming unique pages (protstart=%p, protend=%p)",
+			    __func__, (void *)protstart, (void *)protend));
+			KASSERT(protstart == trunc_page(vaddr),
+			    ("%s: Page doesn't span protection ranges after "
+			    "trimming (protstart=%p, protend=%p, vaddr=%p)",
+			    __func__, (void *)protstart, (void *)protend,
+			    (void *)vaddr));
+			lastprot |= prot;
+			if (vaddr + ef->progtab[i].size >
+			    protstart + PAGE_SIZE) {
+				/*
+				 * The current range extends beyond the
+				 * range 2 page.  Set the permissions for
+				 * the range 2 page now (with the shared
+				 * protections) and make the remainder,
+				 * range 3, the new current range.
+				 */
+				link_elf_set_prot(protstart,
+				    protstart + PAGE_SIZE, lastprot,
+				    lf->filename, false);
+				protstart += PAGE_SIZE;
+				lastprot = prot;
+			}
+			/* Record the end of the current range. */
+			protend = vaddr + ef->progtab[i].size;
+			continue;
+		}
+
+		/*
+		 * Ranges do not share pages.  Set permissions for the
+		 * previous range and start a new one.
+		 */
+		link_elf_set_prot(protstart, trunc_page(vaddr), lastprot,
+		    lf->filename, false);
+		lastprot = prot;
+		protstart = trunc_page(vaddr);
+		protend = vaddr + ef->progtab[i].size;
+	}
+	/*
+	 * Set permissions for the last range.  Again, this is
+	 * complicated by not knowing exactly what the bootloader has
+	 * done and by the possibility of "holes".
+	 *
+	 * If the last range ends on a page prior to the end of the
+	 * overall allocation, we treat the rest of the allocation
+	 * as a "hole" and give it VM_PROT_READ permissions.
+	 *
+	 * If the overall allocation ends mid-page, we assume the last
+	 * page may need all permissions, so we leave it alone.
+	 */
+	rangeend = trunc_page(rangeend);
+	if (protend > rangeend)
+		protend = rangeend;
+	else
+		protend = round_page(protend);
+	if (protstart < protend) {
+		link_elf_set_prot(protstart, protend, lastprot,
+		    lf->filename, false);
+		link_elf_set_prot(protend, rangeend, VM_PROT_READ,
+		    lf->filename, false);
+	}
+
 	/* Notify MD code that a module is being loaded. */
 	error = elf_cpu_load_file(lf);
 	if (error)
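The "up to three ranges" case handled in link_elf_link_preload_finish() above is easier to follow with concrete numbers. The sketch below is illustration only, not kernel code: the addresses, sizes, and protection encoding are invented, and PAGE_SIZE/TRUNC_PAGE() stand in for the kernel's page size and trunc_page(). It shows only how the split is computed when the previous range ends mid-page on the same page where the next section starts; the real code additionally carries range 3 forward as the new current range instead of protecting it immediately.

/*
 * Standalone sketch of the "three ranges" split: a previous protection
 * range ending mid-page and a new section starting on that same page.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(PAGE_SIZE - 1)
#define TRUNC_PAGE(x)	((x) & ~PAGE_MASK)

static void
show(const char *what, unsigned long start, unsigned long end, unsigned prot)
{
	if (start < end)
		printf("%-7s [0x%05lx, 0x%05lx) prot 0x%x\n",
		    what, start, end, prot);
}

int
main(void)
{
	/* Previous range: read-only, ends mid-page. */
	unsigned long protstart = 0x10000, protend = 0x12a00;
	unsigned lastprot = 0x1;		/* R */
	/* New section: read-write, starts on the same page. */
	unsigned long vaddr = 0x12a40, size = 0x2400;
	unsigned prot = 0x3;			/* RW */

	if ((protend & PAGE_MASK) != 0 &&
	    TRUNC_PAGE(protend) == TRUNC_PAGE(vaddr)) {
		/* Range 1: pages used only by the previous range. */
		show("range 1", protstart, TRUNC_PAGE(protend), lastprot);
		/* Range 2: the shared page gets the union of permissions. */
		show("range 2", TRUNC_PAGE(protend),
		    TRUNC_PAGE(protend) + PAGE_SIZE, lastprot | prot);
		/* Range 3: the rest of the new section. */
		show("range 3", TRUNC_PAGE(protend) + PAGE_SIZE,
		    vaddr + size, prot);
	}
	return (0);
}

Here the shared page ends up with the union of the read-only and read-write permissions, while the pages on either side keep exactly the permissions their sections asked for.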
@@ -501,7 +750,8 @@
 	Elf_Shdr *shdr;
 	Elf_Sym *es;
 	int nbytes, i, j;
-	vm_offset_t mapbase;
+	vm_offset_t mapbase, protstart;
+	vm_prot_t lastprot, prot;
 	size_t mapsize;
 	int error = 0;
 	ssize_t resid;
@@ -739,6 +989,8 @@
 
 	/* Size up code/data(progbits) and bss(nobits). */
 	alignmask = 0;
+	lastprot = VM_PROT_NONE;
+	pb = 0;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_size == 0)
 			continue;
@@ -750,6 +1002,44 @@
 #endif
 		if ((shdr[i].sh_flags & SHF_ALLOC) == 0)
 			break;
+		/*
+		 * Determine the names, which lets us know
+		 * whether we need to use mapped space for these.
+		 */
+		if (ef->shstrtab != NULL && shdr[i].sh_name != 0) {
+			ef->progtab[pb].name =
+			    ef->shstrtab + shdr[i].sh_name;
+		} else if (shdr[i].sh_type == SHT_PROGBITS)
+			ef->progtab[pb].name = "<<PROGBITS>>";
+#ifdef __amd64__
+		else if (shdr[i].sh_type == SHT_X86_64_UNWIND)
+			ef->progtab[pb].name = "<<UNWIND>>";
+#endif
+		else
+			ef->progtab[pb].name = "<<NOBITS>>";
+		ef->progtab[pb].sec = i;
+		/*
+		 * We don't use mapped space for DPCPU or VNET
+		 * sections.
+		 */
+		if (ef->progtab[pb].name != NULL && (
+#ifdef VIMAGE
+		    !strcmp(ef->progtab[pb].name, VNET_SETNAME) ||
+#endif
+		    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))) {
+			pb++;
+			break;
+		}
+		pb++;
+		/*
+		 * If the protections will be different from the
+		 * last section, round up to a page boundary.
+		 */
+		if ((prot = link_elf_flags_to_prot(&shdr[i])) !=
+		    lastprot) {
+			lastprot = prot;
+			mapsize = roundup2(mapsize, PAGE_SIZE);
+		}
 		alignmask = shdr[i].sh_addralign - 1;
 		mapsize += alignmask;
 		mapsize &= ~alignmask;
@@ -757,6 +1047,13 @@
 			break;
 		}
 	}
+	mapsize = roundup2(mapsize, PAGE_SIZE);
+	if (pb != ef->nprogtab) {
+		link_elf_error(filename,
+		    "lost progbits (while getting mapsize)");
+		error = ENOEXEC;
+		goto out;
+	}
 
 	/*
 	 * We know how much space we need for the text/data/bss/etc.
@@ -811,6 +1108,7 @@
 	rl = 0;
 	ra = 0;
 	alignmask = 0;
+	lastprot = VM_PROT_NONE;
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if (shdr[i].sh_size == 0)
 			continue;
@@ -822,24 +1120,6 @@
 #endif
 		if ((shdr[i].sh_flags & SHF_ALLOC) == 0)
 			break;
-		alignmask = shdr[i].sh_addralign - 1;
-		mapbase += alignmask;
-		mapbase &= ~alignmask;
-		if (ef->shstrtab != NULL && shdr[i].sh_name != 0) {
-			ef->progtab[pb].name =
-			    ef->shstrtab + shdr[i].sh_name;
-			if (!strcmp(ef->progtab[pb].name, ".ctors")) {
-				lf->ctors_addr = (caddr_t)mapbase;
-				lf->ctors_size = shdr[i].sh_size;
-			}
-		} else if (shdr[i].sh_type == SHT_PROGBITS)
-			ef->progtab[pb].name = "<<PROGBITS>>";
-#ifdef __amd64__
-		else if (shdr[i].sh_type == SHT_X86_64_UNWIND)
-			ef->progtab[pb].name = "<<UNWIND>>";
-#endif
-		else
-			ef->progtab[pb].name = "<<NOBITS>>";
 		if (ef->progtab[pb].name != NULL &&
 		    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
 			ef->progtab[pb].addr =
@@ -850,15 +1130,38 @@
 			ef->progtab[pb].addr =
 			    vnet_data_alloc(shdr[i].sh_size);
 #endif
-		else
+		else {
+			/*
+			 * Figure out whether this has the same
+			 * protection requirements as the previous
+			 * section using mapbase.  If so, they can
+			 * share the same page.
+			 */
+			if ((prot = link_elf_flags_to_prot(&shdr[i])) !=
+			    lastprot) {
+				mapbase = roundup2(mapbase, PAGE_SIZE);
+				lastprot = prot;
+			}
+			/* Respect section alignment. */
+			alignmask = shdr[i].sh_addralign - 1;
+			mapbase += alignmask;
+			mapbase &= ~alignmask;
+			if (ef->progtab[pb].name != NULL &&
+			    !strcmp(ef->progtab[pb].name, ".ctors")) {
+				lf->ctors_addr = (caddr_t)mapbase;
+				lf->ctors_size = shdr[i].sh_size;
+			}
 			ef->progtab[pb].addr =
 			    (void *)(uintptr_t)mapbase;
+		}
 		if (ef->progtab[pb].addr == NULL) {
 			error = ENOSPC;
 			goto out;
 		}
 		ef->progtab[pb].size = shdr[i].sh_size;
-		ef->progtab[pb].sec = i;
+		KASSERT(ef->progtab[pb].sec == i,
+		    ("%s: pb %d section number changed from %d to %d",
+		    __func__, pb, ef->progtab[pb].sec, i));
 		if (shdr[i].sh_type == SHT_PROGBITS
 #ifdef __amd64__
 		    || shdr[i].sh_type == SHT_X86_64_UNWIND
 #endif
@@ -943,6 +1246,7 @@
 			break;
 		}
 	}
+	mapbase = roundup2(mapbase, PAGE_SIZE);
 	if (pb != ef->nprogtab) {
 		link_elf_error(filename, "lost progbits");
 		error = ENOEXEC;
@@ -985,6 +1289,61 @@
 	if (error)
 		goto out;
 
+	/* Now that everything is done, set protections. */
+	alignmask = 0;
+	lastprot = VM_PROT_NONE;
+	protstart = (vm_offset_t)ef->address;
+	for (pb = 0; pb < ef->nprogtab; pb++) {
+		KASSERT(ef->progtab[pb].size > 0,
+		    ("%s: section %d size is unexpectedly 0",
+		    __func__, ef->progtab[pb].sec));
+		/*
+		 * We don't use mapped space for DPCPU or VNET
+		 * sections.
+		 */
+		if (ef->progtab[pb].name != NULL && (
+#ifdef VIMAGE
+		    !strcmp(ef->progtab[pb].name, VNET_SETNAME) ||
+#endif
+		    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))) {
+			continue;
+		}
+		KASSERT((vm_offset_t)ef->progtab[pb].addr >= protstart &&
+		    ((vm_offset_t)ef->progtab[pb].addr) +
+		    ef->progtab[pb].size <= mapbase,
+		    ("%s: progtab[%d].addr (%p, %lu) outside range "
+		    "(%p-%p) (ef=%p)", __func__, pb,
+		    ef->progtab[pb].addr, (u_long)ef->progtab[pb].size,
+		    (void *)protstart, (void *)mapbase, ef));
+		/*
+		 * If the protections will be different from the
+		 * last section, set the protection appropriately.
+		 */
+		prot = link_elf_flags_to_prot(&shdr[ef->progtab[pb].sec]);
+		if (prot != lastprot) {
+			KASSERT(((uintptr_t)ef->progtab[pb].addr &
+			    PAGE_MASK) == 0,
+			    ("%s: Address of new protection area "
+			    "does not start on a page boundary "
+			    "(addr=%p, ef=%p, pb=%d)",
+			    __func__, ef->progtab[pb].addr, ef, pb));
+			link_elf_set_prot(protstart,
+			    (vm_offset_t)ef->progtab[pb].addr,
+			    lastprot, filename, true);
+			lastprot = prot;
+			protstart = (vm_offset_t)ef->progtab[pb].addr;
+		}
+		KASSERT(pb != (ef->nprogtab - 1) ||
+		    roundup2(((vm_offset_t)ef->progtab[pb].addr) +
+		    ef->progtab[pb].size, PAGE_SIZE) == mapbase,
+		    ("%s: The last section's address space does not "
+		    "end where expected (ef=%p, progtab[%d].addr "
+		    "(%p, %lu), mapbase=%p)", __func__, ef, pb,
+		    ef->progtab[pb].addr, (u_long)ef->progtab[pb].size,
+		    (void *)mapbase));
+	}
+	link_elf_set_prot(protstart, mapbase, lastprot, filename, true);
+
 	/* Notify MD code that a module is being loaded. */
 	error = elf_cpu_load_file(lf);
 	if (error)
@@ -1041,6 +1400,10 @@
 	if (file->filename != NULL)
 		preload_delete_name(file->filename);
 	/* XXX reclaim module memory? */
+	/* For now, make the memory inaccessible. */
+	link_elf_set_prot(round_page((vm_offset_t)file->address),
+	    trunc_page(((vm_offset_t)file->address) + file->size),
+	    VM_PROT_NONE, file->filename, false);
 	return;
 }
Index: sys/sys/link_elf.h
===================================================================
--- sys/sys/link_elf.h
+++ sys/sys/link_elf.h
@@ -90,6 +90,12 @@
 	void *dlpi_tls_data;
 };
 
+#define	link_elf_flags_to_prot(shdr)					\
+	(((shdr)->sh_flags & SHF_EXECINSTR) != 0 ?			\
+	    (VM_PROT_EXECUTE | VM_PROT_READ) :				\
+	    (((shdr)->sh_flags & SHF_WRITE) != 0 ?			\
+	    VM_PROT_RW : VM_PROT_READ))
+
 __BEGIN_DECLS
 typedef int (*__dl_iterate_hdr_callback)(struct dl_phdr_info *, size_t,
     void *);
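Finally, a minimal userland sketch of what the new link_elf_flags_to_prot() macro yields for common section flag combinations. The SHF_* and VM_PROT_* constants are redefined locally, mirroring their <sys/elf_common.h> and <vm/vm.h> values, so that the example builds outside the kernel; the macro body is the one added to sys/sys/link_elf.h above.

/*
 * Userland sketch of the link_elf_flags_to_prot() mapping; the
 * definitions below are local stand-ins so the example compiles
 * outside the kernel.
 */
#include <stdio.h>

#define SHF_WRITE	0x1
#define SHF_ALLOC	0x2
#define SHF_EXECINSTR	0x4

#define VM_PROT_READ	0x01
#define VM_PROT_WRITE	0x02
#define VM_PROT_EXECUTE	0x04
#define VM_PROT_RW	(VM_PROT_READ | VM_PROT_WRITE)

struct fake_shdr {
	unsigned long sh_flags;
};

#define link_elf_flags_to_prot(shdr)				\
	(((shdr)->sh_flags & SHF_EXECINSTR) != 0 ?		\
	    (VM_PROT_EXECUTE | VM_PROT_READ) :			\
	    (((shdr)->sh_flags & SHF_WRITE) != 0 ?		\
	    VM_PROT_RW : VM_PROT_READ))

int
main(void)
{
	/* Typical flag combinations for .text, .rodata, and .data/.bss. */
	struct fake_shdr text = { SHF_ALLOC | SHF_EXECINSTR };
	struct fake_shdr rodata = { SHF_ALLOC };
	struct fake_shdr data = { SHF_ALLOC | SHF_WRITE };

	printf(".text   -> 0x%x (R+X)\n", link_elf_flags_to_prot(&text));
	printf(".rodata -> 0x%x (R)\n", link_elf_flags_to_prot(&rodata));
	printf(".data   -> 0x%x (RW)\n", link_elf_flags_to_prot(&data));
	return (0);
}

Note that a section carrying both SHF_WRITE and SHF_EXECINSTR maps to read+execute: the macro never hands out write and execute together.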