Index: head/sys/kern/imgact_elf.c =================================================================== --- head/sys/kern/imgact_elf.c (revision 297390) +++ head/sys/kern/imgact_elf.c (revision 297391) @@ -1,2333 +1,2329 @@ /*- * Copyright (c) 2000 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_gzio.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __arm__ #include #endif #define ELF_NOTE_ROUNDSIZE 4 #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int interp_name_len, int32_t *osrel); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize); static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); #define CORE_BUF_SIZE (16 * 1024) int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, ""); int __elfN(nxstack) = #if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \ (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__) 1; #else 0; #endif SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, nxstack, CTLFLAG_RW, &__elfN(nxstack), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack"); #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) int i386_read_exec = 0; SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, "enable execution from readable segments"); #endif #endif static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define trunc_page_ps(va, ps) ((va) & ~(ps - 1)) #define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1)) #define aligned(a, t) (trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a)) static const char FREEBSD_ABI_VENDOR[] = "FreeBSD"; Elf_Brandnote __elfN(freebsd_brandnote) = { .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR), .hdr.n_descsz = sizeof(int32_t), .hdr.n_type = NT_FREEBSD_ABI_TAG, .vendor = FREEBSD_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = __elfN(freebsd_trans_osrel) }; static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel) { uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); *osrel = *(const int32_t *)(p); return (TRUE); } static const char GNU_ABI_VENDOR[] = "GNU"; static int GNU_KFREEBSD_ABI_DESC = 3; Elf_Brandnote __elfN(kfreebsd_brandnote) = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = kfreebsd_trans_osrel }; static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); desc = (const Elf32_Word *)p; if (desc[0] != GNU_KFREEBSD_ABI_DESC) return (FALSE); /* * Debian GNU/kFreeBSD embed the earliest compatible kernel version * (__FreeBSD_version: Rxx) in the LSB way. */ *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3]; return (TRUE); } int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) { printf("WARNING: %s: could not insert brandinfo entry: %p\n", __func__, entry); return (-1); } return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int interp_name_len, int32_t *osrel) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; Elf_Brandinfo *bi, *bi_m; boolean_t ret; int i; /* * We support four types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, (3) path of the `interp_path' * field, and (4) the ".note.ABI-tag" ELF section. */ /* Look for an ".note.ABI-tag" ELF section */ bi_m = NULL; for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL) continue; if (hdr->e_machine == bi->machine && (bi->flags & (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) { ret = __elfN(check_note)(imgp, bi->brand_note, osrel); /* Give brand a chance to veto check_note's guess */ if (ret && bi->header_supported) ret = bi->header_supported(imgp); /* * If note checker claimed the binary, but the * interpreter path in the image does not * match default one for the brand, try to * search for other brands with the same * interpreter. Either there is better brand * with the right interpreter, or, failing * this, we return first brand which accepted * our note and, optionally, header. */ if (ret && bi_m == NULL && (strlen(bi->interp_path) + 1 != interp_name_len || strncmp(interp, bi->interp_path, interp_name_len) != 0)) { bi_m = bi; ret = 0; } if (ret) return (bi); } } if (bi_m != NULL) return (bi_m); /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0)) { /* Looks good, but give brand a chance to veto */ if (!bi->header_supported || bi->header_supported(imgp)) return (bi); } } /* No known brand, see if the header is recognized by any brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY || bi->header_supported == NULL) continue; if (hdr->e_machine == bi->machine) { ret = bi->header_supported(imgp); if (ret) return (bi); } } /* Lacking a known brand, search for a recognized interpreter. */ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && /* ELF image p_filesz includes terminating zero */ strlen(bi->interp_path) + 1 == interp_name_len && strncmp(interp, bi->interp_path, interp_name_len) == 0) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { struct sf_buf *sf; int error; vm_offset_t off; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_lock(map); vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Find the page from the underlying object. */ if (object) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } } return (KERN_SUCCESS); } static int __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { struct sf_buf *sf; vm_offset_t off; vm_size_t sz; int error, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, round_page(start), prot); if (rv) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + trunc_page(end) - start, trunc_page(end), end, prot); if (rv) return (rv); end = trunc_page(end); } if (end > start) { if (offset & PAGE_MASK) { /* * The mapping is not page aligned. This means we have * to copy the data. Sigh. */ rv = vm_map_find(map, NULL, 0, &start, end - start, 0, VMFS_NO_SPACE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0); if (rv) return (rv); if (object == NULL) return (KERN_SUCCESS); for (; start < end; start += sz) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; if (sz > PAGE_SIZE - off) sz = PAGE_SIZE - off; error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } offset += sz; } rv = KERN_SUCCESS; } else { vm_object_reference(object); vm_map_lock(map); rv = vm_map_insert(map, object, offset, start, end, prot, VM_PROT_ALL, cow); vm_map_unlock(map); if (rv != KERN_SUCCESS) vm_object_deallocate(object); } return (rv); } else { return (KERN_SUCCESS); } } static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize) { struct sf_buf *sf; size_t map_len; vm_map_t map; vm_object_t object; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_offset_t file_addr; /* * It's necessary to fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((off_t)filsz + offset > imgp->attr->va_size || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } object = imgp->object; map = &imgp->proc->p_vmspace->vm_map; map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize); file_addr = trunc_page_ps(offset, pagesize); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second.. */ if (memsz > filsz) map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr; else map_len = round_page_ps(offset + filsz, pagesize) - file_addr; if (map_len != 0) { /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); /* we can stop now if we've covered it all */ if (memsz == filsz) { return (0); } } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize); map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize); map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) - map_addr; /* This had damn well better be true! */ if (map_len != 0) { rv = __elfN(map_insert)(map, NULL, 0, map_addr, map_addr + map_len, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (EINVAL); } } if (copy_len != 0) { vm_offset_t off; sf = vm_imgact_map_page(object, offset + filsz); if (sf == NULL) return (EIO); /* send the page fragment to user space */ off = trunc_page_ps(offset + filsz, pagesize) - trunc_page(offset + filsz); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)map_addr, copy_len); vm_imgact_unmap_page(sf); if (error) { return (error); } } /* * set it to the specified protection. * XXX had better undo the damage from pasting over the cracks here! */ vm_map_protect(map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. * * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vattr *attr; struct image_params *imgp; vm_prot_t prot; u_long rbase; u_long base_addr = 0; int error, i, numsegs; #ifdef CAPABILITY_MODE /* * XXXJA: This check can go away once we are sufficiently confident * that the checks in namei() are correct. */ if (IN_CAPABILITY_MODE(curthread)) return (ECAPMODE); #endif tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; imgp->firstpage = NULL; imgp->image_header = NULL; imgp->object = NULL; imgp->execlabel = NULL; NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread); if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); if (error) goto fail; error = exec_map_first_page(imgp); if (error) goto fail; /* * Also make certain that the interpreter stays the same, so set * its VV_TEXT flag, too. */ VOP_SET_TEXT(nd->ni_vp); imgp->object = nd->ni_vp->v_object; hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { error = ENOEXEC; goto fail; } for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) { /* Loadable segment */ prot = __elfN(trans_prot)(phdr[i].p_flags); error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize); if (error != 0) goto fail; /* * Establish the base address if this is the * first segment. */ if (numsegs == 0) base_addr = trunc_page(phdr[i].p_vaddr + rbase); numsegs++; } } *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); if (nd->ni_vp) vput(nd->ni_vp); free(tempdata, M_TEMP); return (error); } static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { struct thread *td; const Elf_Ehdr *hdr; const Elf_Phdr *phdr; Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; const char *err_str, *newinterp; char *interp, *interp_buf, *path; Elf_Brandinfo *brand_info; struct sysentvec *sv; vm_prot_t prot; u_long text_size, data_size, total_size, text_addr, data_addr; u_long seg_size, seg_addr, addr, baddr, et_dyn_addr, entry, proghdr; int32_t osrel; int error, i, n, interp_name_len, have_interp; hdr = (const Elf_Ehdr *)imgp->image_header; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { /* Only support headers in first page for now */ uprintf("Program headers not in the first page\n"); return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { uprintf("Unaligned program headers\n"); return (ENOEXEC); } n = error = 0; baddr = 0; osrel = 0; text_size = data_size = total_size = text_addr = data_addr = 0; entry = proghdr = 0; interp_name_len = 0; err_str = newinterp = NULL; interp = interp_buf = NULL; td = curthread; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: if (n == 0) baddr = phdr[i].p_vaddr; n++; break; case PT_INTERP: /* Path to interpreter */ if (phdr[i].p_filesz > MAXPATHLEN) { uprintf("Invalid PT_INTERP\n"); error = ENOEXEC; goto ret; } if (interp != NULL) { uprintf("Multiple PT_INTERP headers\n"); error = ENOEXEC; goto ret; } interp_name_len = phdr[i].p_filesz; if (phdr[i].p_offset > PAGE_SIZE || interp_name_len > PAGE_SIZE - phdr[i].p_offset) { VOP_UNLOCK(imgp->vp, 0); interp_buf = malloc(interp_name_len + 1, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); error = vn_rdwr(UIO_READ, imgp->vp, interp_buf, interp_name_len, phdr[i].p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td); if (error != 0) { uprintf("i/o error PT_INTERP\n"); goto ret; } interp_buf[interp_name_len] = '\0'; interp = interp_buf; } else { interp = __DECONST(char *, imgp->image_header) + phdr[i].p_offset; } break; case PT_GNU_STACK: if (__elfN(nxstack)) imgp->stack_prot = __elfN(trans_prot)(phdr[i].p_flags); imgp->stack_sz = phdr[i].p_memsz; break; } } brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len, &osrel); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); error = ENOEXEC; goto ret; } if (hdr->e_type == ET_DYN) { if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) { uprintf("Cannot execute shared object\n"); error = ENOEXEC; goto ret; } /* * Honour the base load address from the dso if it is * non-zero for some reason. */ if (baddr == 0) et_dyn_addr = ET_DYN_LOAD_ADDR; else et_dyn_addr = 0; } else et_dyn_addr = 0; sv = brand_info->sysvec; if (interp != NULL && brand_info->interp_newpath != NULL) newinterp = brand_info->interp_newpath; /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. * * The VV_TEXT flag prevents modifications to the executable while * the vnode is unlocked. */ VOP_UNLOCK(imgp->vp, 0); error = exec_new_vmspace(imgp, sv); imgp->proc->p_sysent = sv; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto ret; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: /* Loadable segment */ if (phdr[i].p_memsz == 0) break; prot = __elfN(trans_prot)(phdr[i].p_flags); error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr, phdr[i].p_memsz, phdr[i].p_filesz, prot, sv->sv_pagesize); if (error != 0) goto ret; /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. */ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff + et_dyn_addr; seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr + et_dyn_addr - seg_addr); /* * Make the largest executable segment the official * text segment and all others data. * * Note that obreak() assumes that data_addr + * data_size == end of data load area, and the ELF * file format expects segments to be sorted by * address. If multiple data segments exist, the * last one will be used. */ if (phdr[i].p_flags & PF_X && text_size < seg_size) { text_size = seg_size; text_addr = seg_addr; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr + et_dyn_addr; break; default: break; } } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } entry = (u_long)hdr->e_entry + et_dyn_addr; /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. */ PROC_LOCK(imgp->proc); if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA)) err_str = "Data segment size exceeds process limit"; else if (text_size > maxtsiz) err_str = "Text segment size exceeds system limit"; else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM)) err_str = "Total segment size exceeds process limit"; else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0) err_str = "Data segment size exceeds resource limit"; else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) err_str = "Total segment size exceeds resource limit"; if (err_str != NULL) { PROC_UNLOCK(imgp->proc); uprintf("%s\n", err_str); error = ENOMEM; goto ret; } vmspace = imgp->proc->p_vmspace; vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. */ addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(td, RLIMIT_DATA)); PROC_UNLOCK(imgp->proc); imgp->entry_addr = entry; if (interp != NULL) { have_interp = FALSE; VOP_UNLOCK(imgp->vp, 0); if (brand_info->emul_path != NULL && brand_info->emul_path[0] != '\0') { path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, interp); error = __elfN(load_file)(imgp->proc, path, &addr, &imgp->entry_addr, sv->sv_pagesize); free(path, M_TEMP); if (error == 0) have_interp = TRUE; } if (!have_interp && newinterp != NULL && (brand_info->interp_path == NULL || strcmp(interp, brand_info->interp_path) == 0)) { error = __elfN(load_file)(imgp->proc, newinterp, &addr, &imgp->entry_addr, sv->sv_pagesize); if (error == 0) have_interp = TRUE; } if (!have_interp) { error = __elfN(load_file)(imgp->proc, interp, &addr, &imgp->entry_addr, sv->sv_pagesize); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) { uprintf("ELF interpreter %s not found, error %d\n", interp, error); goto ret; } } else addr = et_dyn_addr; /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; elf_auxargs->hdr_eflags = hdr->e_flags; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; imgp->reloc_base = addr; imgp->proc->p_osrel = osrel; ret: free(interp_buf, M_TEMP); return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Addr *base; Elf_Addr *pos; base = (Elf_Addr *)*stack_base; pos = base + (imgp->args->argc + imgp->args->envc + 2); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); #ifdef AT_EHDRFLAGS AUXARGS_ENTRY(pos, AT_EHDRFLAGS, args->hdr_eflags); #endif if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp); AUXARGS_ENTRY(pos, AT_OSRELDATE, imgp->proc->p_ucred->cr_prison->pr_osreldate); if (imgp->canary != 0) { AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary); AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen); } AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus); if (imgp->pagesizes != 0) { AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes); AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen); } if (imgp->sysent->sv_timekeep_base != 0) { AUXARGS_ENTRY(pos, AT_TIMEKEEP, imgp->sysent->sv_timekeep_base); } AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : imgp->sysent->sv_stackprot); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword(base, (long)imgp->args->argc); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. */ }; typedef void (*outfunc_t)(void *, struct sbuf *, size_t *); struct note_info { int type; /* Note type. */ outfunc_t outfunc; /* Output function. */ void *outarg; /* Argument for the output function. */ size_t outsize; /* Output size. */ TAILQ_ENTRY(note_info) link; /* Link to the next note info. */ }; TAILQ_HEAD(note_info_list, note_info); /* Coredump output parameters. */ struct coredump_params { off_t offset; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; struct vnode *vp; struct gzio_stream *gzs; }; static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static int core_write(struct coredump_params *, void *, size_t, off_t, enum uio_seg); static void each_writable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct coredump_params *, int, void *, size_t, struct note_info_list *, size_t); static void __elfN(prepare_notes)(struct thread *, struct note_info_list *, size_t *); static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t); static void __elfN(putnote)(struct note_info *, struct sbuf *); static size_t register_note(struct note_info_list *, int, outfunc_t, void *); static int sbuf_drain_core_output(void *, const char *, int); static int sbuf_drain_count(void *arg, const char *data, int len); static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *); static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *); static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *); static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *); static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *); static void note_procstat_files(void *, struct sbuf *, size_t *); static void note_procstat_groups(void *, struct sbuf *, size_t *); static void note_procstat_osrel(void *, struct sbuf *, size_t *); static void note_procstat_rlimit(void *, struct sbuf *, size_t *); static void note_procstat_umask(void *, struct sbuf *, size_t *); static void note_procstat_vmmap(void *, struct sbuf *, size_t *); #ifdef GZIO extern int compress_user_cores_gzlevel; /* * Write out a core segment to the compression stream. */ static int compress_chunk(struct coredump_params *p, char *base, char *buf, u_int len) { u_int chunk_len; int error; while (len > 0) { chunk_len = MIN(len, CORE_BUF_SIZE); copyin(base, buf, chunk_len); error = gzio_write(p->gzs, buf, chunk_len); if (error != 0) break; base += chunk_len; len -= chunk_len; } return (error); } static int core_gz_write(void *base, size_t len, off_t offset, void *arg) { return (core_write((struct coredump_params *)arg, base, len, offset, UIO_SYSSPACE)); } #endif /* GZIO */ static int core_write(struct coredump_params *p, void *base, size_t len, off_t offset, enum uio_seg seg) { return (vn_rdwr_inchunks(UIO_WRITE, p->vp, base, len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, p->active_cred, p->file_cred, NULL, p->td)); } static int core_output(void *base, size_t len, off_t offset, struct coredump_params *p, void *tmpbuf) { #ifdef GZIO if (p->gzs != NULL) return (compress_chunk(p, base, tmpbuf, len)); #endif return (core_write(p, base, len, offset, UIO_USERSPACE)); } /* * Drain into a core file. */ static int sbuf_drain_core_output(void *arg, const char *data, int len) { struct coredump_params *p; int error, locked; p = (struct coredump_params *)arg; /* * Some kern_proc out routines that print to this sbuf may * call us with the process lock held. Draining with the * non-sleepable lock held is unsafe. The lock is needed for * those routines when dumping a live process. In our case we * can safely release the lock before draining and acquire * again after. */ locked = PROC_LOCKED(p->td->td_proc); if (locked) PROC_UNLOCK(p->td->td_proc); #ifdef GZIO if (p->gzs != NULL) error = gzio_write(p->gzs, __DECONST(char *, data), len); else #endif error = core_write(p, __DECONST(void *, data), len, p->offset, UIO_SYSSPACE); if (locked) PROC_LOCK(p->td->td_proc); if (error != 0) return (-error); p->offset += len; return (len); } /* * Drain into a counter. */ static int sbuf_drain_count(void *arg, const char *data __unused, int len) { size_t *sizep; sizep = (size_t *)arg; *sizep += len; return (len); } int __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; struct note_info_list notelst; struct coredump_params params; struct note_info *ninfo; void *hdr, *tmpbuf; size_t hdrsize, notesz, coresize; #ifdef GZIO boolean_t compress; compress = (flags & IMGACT_CORE_COMPRESS) != 0; #endif hdr = NULL; tmpbuf = NULL; TAILQ_INIT(¬elst); /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_writable_segment(td, cb_size_segment, &seginfo); /* * Collect info about the core file header area. */ hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count); __elfN(prepare_notes)(td, ¬elst, ¬esz); coresize = round_page(hdrsize + notesz) + seginfo.size; /* Set up core dump parameters. */ params.offset = 0; params.active_cred = cred; params.file_cred = NOCRED; params.td = td; params.vp = vp; params.gzs = NULL; #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_CORE, coresize); PROC_UNLOCK(td->td_proc); if (error != 0) { error = EFAULT; goto done; } } #endif if (coresize >= limit) { error = EFAULT; goto done; } #ifdef GZIO /* Create a compression stream if necessary. */ if (compress) { params.gzs = gzio_init(core_gz_write, GZIO_DEFLATE, CORE_BUF_SIZE, compress_user_cores_gzlevel, ¶ms); if (params.gzs == NULL) { error = EFAULT; goto done; } tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO); } #endif /* * Allocate memory for building the header, fill it up, * and write it out following the notes. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); - if (hdr == NULL) { - error = EINVAL; - goto done; - } error = __elfN(corehdr)(¶ms, seginfo.count, hdr, hdrsize, ¬elst, notesz); /* Write the contents of all of the writable segments. */ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = round_page(hdrsize + notesz); for (i = 0; i < seginfo.count; i++) { error = core_output((caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, ¶ms, tmpbuf); if (error != 0) break; offset += php->p_filesz; php++; } #ifdef GZIO if (error == 0 && compress) error = gzio_flush(params.gzs); #endif } if (error) { log(LOG_WARNING, "Failed to write core file for process %s (error %d)\n", curproc->p_comm, error); } done: #ifdef GZIO if (compress) { free(tmpbuf, M_TEMP); if (params.gzs != NULL) gzio_fini(params.gzs); } #endif while ((ninfo = TAILQ_FIRST(¬elst)) != NULL) { TAILQ_REMOVE(¬elst, ninfo, link); free(ninfo, M_TEMP); } if (hdr != NULL) free(hdr, M_TEMP); return (error); } /* * A callback for each_writable_segment() to write out the segment's * program header entry. */ static void cb_put_phdr(entry, closure) vm_map_entry_t entry; void *closure; { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = __elfN(untrans_prot)(entry->protection); phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_writable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(entry, closure) vm_map_entry_t entry; void *closure; { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_writable_segment(td, func, closure) struct thread *td; segment_callback func; void *closure; { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; vm_object_t backing_object, object; boolean_t ignore_entry; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. */ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((object = entry->object.vm_object) == NULL) continue; /* Ignore memory-mapped devices and such things. */ VM_OBJECT_RLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_RLOCK(backing_object); VM_OBJECT_RUNLOCK(object); object = backing_object; } ignore_entry = object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE && object->type != OBJT_PHYS; VM_OBJECT_RUNLOCK(object); if (ignore_entry) continue; (*func)(entry, closure); } vm_map_unlock_read(map); } /* * Write the core file header to the file, including padding up to * the page boundary. */ static int __elfN(corehdr)(struct coredump_params *p, int numsegs, void *hdr, size_t hdrsize, struct note_info_list *notelst, size_t notesz) { struct note_info *ninfo; struct sbuf *sb; int error; /* Fill in the header. */ bzero(hdr, hdrsize); __elfN(puthdr)(p->td, hdr, hdrsize, numsegs, notesz); sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_core_output, p); sbuf_start_section(sb, NULL); sbuf_bcat(sb, hdr, hdrsize); TAILQ_FOREACH(ninfo, notelst, link) __elfN(putnote)(ninfo, sb); /* Align up to a page boundary for the program segments. */ sbuf_end_section(sb, -1, PAGE_SIZE, 0); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static void __elfN(prepare_notes)(struct thread *td, struct note_info_list *list, size_t *sizep) { struct proc *p; struct thread *thr; size_t size; p = td->td_proc; size = 0; size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. */ thr = td; while (thr != NULL) { size += register_note(list, NT_PRSTATUS, __elfN(note_prstatus), thr); size += register_note(list, NT_FPREGSET, __elfN(note_fpregset), thr); size += register_note(list, NT_THRMISC, __elfN(note_thrmisc), thr); size += register_note(list, -1, __elfN(note_threadmd), thr); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } size += register_note(list, NT_PROCSTAT_PROC, __elfN(note_procstat_proc), p); size += register_note(list, NT_PROCSTAT_FILES, note_procstat_files, p); size += register_note(list, NT_PROCSTAT_VMMAP, note_procstat_vmmap, p); size += register_note(list, NT_PROCSTAT_GROUPS, note_procstat_groups, p); size += register_note(list, NT_PROCSTAT_UMASK, note_procstat_umask, p); size += register_note(list, NT_PROCSTAT_RLIMIT, note_procstat_rlimit, p); size += register_note(list, NT_PROCSTAT_OSREL, note_procstat_osrel, p); size += register_note(list, NT_PROCSTAT_PSSTRINGS, __elfN(note_procstat_psstrings), p); size += register_note(list, NT_PROCSTAT_AUXV, __elfN(note_procstat_auxv), p); *sizep = size; } static void __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs, size_t notesz) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; struct phdr_closure phc; ehdr = (Elf_Ehdr *)hdr; phdr = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)); ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ehdr->e_machine = ELF_ARCH32; #else ehdr->e_machine = ELF_ARCH; #endif ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = sizeof(Elf_Ehdr); ehdr->e_flags = 0; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_phnum = numsegs + 1; ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shnum = 0; ehdr->e_shstrndx = SHN_UNDEF; /* * Fill in the program header entries. */ /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = hdrsize; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = PF_R; phdr->p_align = ELF_NOTE_ROUNDSIZE; phdr++; /* All the writable segments from the program. */ phc.phdr = phdr; phc.offset = round_page(hdrsize + notesz); each_writable_segment(td, cb_put_phdr, &phc); } static size_t register_note(struct note_info_list *list, int type, outfunc_t out, void *arg) { struct note_info *ninfo; size_t size, notesize; size = 0; out(arg, NULL, &size); ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK); ninfo->type = type; ninfo->outfunc = out; ninfo->outarg = arg; ninfo->outsize = size; TAILQ_INSERT_TAIL(list, ninfo, link); if (type == -1) return (size); notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static size_t append_note_data(const void *src, void *dst, size_t len) { size_t padded_len; padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE); if (dst != NULL) { bcopy(src, dst, len); bzero((char *)dst + len, padded_len - len); } return (padded_len); } size_t __elfN(populate_note)(int type, void *src, void *dst, size_t size, void **descp) { Elf_Note *note; char *buf; size_t notesize; buf = dst; if (buf != NULL) { note = (Elf_Note *)buf; note->n_namesz = sizeof(FREEBSD_ABI_VENDOR); note->n_descsz = size; note->n_type = type; buf += sizeof(*note); buf += append_note_data(FREEBSD_ABI_VENDOR, buf, sizeof(FREEBSD_ABI_VENDOR)); append_note_data(src, buf, size); if (descp != NULL) *descp = buf; } notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static void __elfN(putnote)(struct note_info *ninfo, struct sbuf *sb) { Elf_Note note; ssize_t old_len, sect_len; size_t new_len, descsz, i; if (ninfo->type == -1) { ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); return; } note.n_namesz = sizeof(FREEBSD_ABI_VENDOR); note.n_descsz = ninfo->outsize; note.n_type = ninfo->type; sbuf_bcat(sb, ¬e, sizeof(note)); sbuf_start_section(sb, &old_len); sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR)); sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); if (note.n_descsz == 0) return; sbuf_start_section(sb, &old_len); ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); sect_len = sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); if (sect_len < 0) return; new_len = (size_t)sect_len; descsz = roundup(note.n_descsz, ELF_NOTE_ROUNDSIZE); if (new_len < descsz) { /* * It is expected that individual note emitters will correctly * predict their expected output size and fill up to that size * themselves, padding in a format-specific way if needed. * However, in case they don't, just do it here with zeros. */ for (i = 0; i < descsz - new_len; i++) sbuf_putc(sb, 0); } else if (new_len > descsz) { /* * We can't always truncate sb -- we may have drained some * of it already. */ KASSERT(new_len == descsz, ("%s: Note type %u changed as we " "read it (%zu > %zu). Since it is longer than " "expected, this coredump's notes are corrupt. THIS " "IS A BUG in the note_procstat routine for type %u.\n", __func__, (unsigned)note.n_type, new_len, descsz, (unsigned)note.n_type)); } } /* * Miscellaneous note out functions. */ #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 #include typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; typedef struct thrmisc32 elf_thrmisc_t; #define ELF_KERN_PROC_MASK KERN_PROC_MASK32 typedef struct kinfo_proc32 elf_kinfo_proc_t; typedef uint32_t elf_ps_strings_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; typedef thrmisc_t elf_thrmisc_t; #define ELF_KERN_PROC_MASK 0 typedef struct kinfo_proc elf_kinfo_proc_t; typedef vm_offset_t elf_ps_strings_t; #endif static void __elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_prpsinfo_t *psinfo; p = (struct proc *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*psinfo), ("invalid size")); psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK); psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); /* * XXX - We don't fill in the command line arguments properly * yet. */ strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); sbuf_bcat(sb, psinfo, sizeof(*psinfo)); free(psinfo, M_TEMP); } *sizep = sizeof(*psinfo); } static void __elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prstatus_t *status; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*status), ("invalid size")); status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK); status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = td->td_proc->p_sig; status->pr_pid = td->td_tid; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_regs32(td, &status->pr_reg); #else fill_regs(td, &status->pr_reg); #endif sbuf_bcat(sb, status, sizeof(*status)); free(status, M_TEMP); } *sizep = sizeof(*status); } static void __elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prfpregset_t *fpregset; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*fpregset), ("invalid size")); fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_fpregs32(td, fpregset); #else fill_fpregs(td, fpregset); #endif sbuf_bcat(sb, fpregset, sizeof(*fpregset)); free(fpregset, M_TEMP); } *sizep = sizeof(*fpregset); } static void __elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_thrmisc_t thrmisc; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(thrmisc), ("invalid size")); bzero(&thrmisc._pad, sizeof(thrmisc._pad)); strcpy(thrmisc.pr_tname, td->td_name); sbuf_bcat(sb, &thrmisc, sizeof(thrmisc)); } *sizep = sizeof(thrmisc); } /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. */ static void __elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; void *buf; size_t size; td = (struct thread *)arg; size = *sizep; if (size != 0 && sb != NULL) buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK); else buf = NULL; size = 0; __elfN(dump_thread)(td, buf, &size); KASSERT(sb == NULL || *sizep == size, ("invalid size")); if (size != 0 && sb != NULL) sbuf_bcat(sb, buf, size); free(buf, M_TEMP); *sizep = size; } #ifdef KINFO_PROC_SIZE CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #endif static void __elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_numthreads * sizeof(elf_kinfo_proc_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sx_slock(&proctree_lock); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); sx_sunlock(&proctree_lock); } *sizep = size; } #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static void note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size, sect_sz, i; ssize_t start_len, sect_len; int structsize, filedesc_flags; if (coredump_pack_fileinfo) filedesc_flags = KERN_FILEDESC_PACK_KINFO; else filedesc_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_file); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, -1, filedesc_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_start_section(sb, &start_len); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, *sizep - sizeof(structsize), filedesc_flags); sect_len = sbuf_end_section(sb, start_len, 0, 0); if (sect_len < 0) return; sect_sz = sect_len; KASSERT(sect_sz <= *sizep, ("kern_proc_filedesc_out did not respect maxlen; " "requested %zu, got %zu", *sizep - sizeof(structsize), sect_sz - sizeof(structsize))); for (i = 0; i < *sizep - sect_sz && sb->s_error == 0; i++) sbuf_putc(sb, 0); } } #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize, vmmap_flags; if (coredump_pack_vmmapinfo) vmmap_flags = KERN_VMMAP_PACK_KINFO; else vmmap_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_vmentry); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, -1, vmmap_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, *sizep - sizeof(structsize), vmmap_flags); } } static void note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(gid_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups * sizeof(gid_t)); } *sizep = size; } static void note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_fd->fd_cmask); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask)); } *sizep = size; } static void note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; struct rlimit rlim[RLIM_NLIMITS]; size_t size; int structsize, i; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(rlim); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(rlim); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); for (i = 0; i < RLIM_NLIMITS; i++) lim_rlimit_proc(p, i, &rlim[i]); PROC_UNLOCK(p); sbuf_bcat(sb, rlim, sizeof(rlim)); } *sizep = size; } static void note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_osrel); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_osrel); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel)); } *sizep = size; } static void __elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_ps_strings_t ps_strings; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(ps_strings); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(ps_strings); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ps_strings = PTROUT(p->p_sysent->sv_psstrings); #else ps_strings = p->p_sysent->sv_psstrings; #endif sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &ps_strings, sizeof(ps_strings)); } *sizep = size; } static void __elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(Elf_Auxinfo); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); } } static boolean_t __elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel, const Elf_Phdr *pnote) { const Elf_Note *note, *note0, *note_end; const char *note_name; char *buf; int i, error; boolean_t res; /* We need some limit, might as well use PAGE_SIZE. */ if (pnote == NULL || pnote->p_filesz > PAGE_SIZE) return (FALSE); ASSERT_VOP_LOCKED(imgp->vp, "parse_notes"); if (pnote->p_offset > PAGE_SIZE || pnote->p_filesz > PAGE_SIZE - pnote->p_offset) { VOP_UNLOCK(imgp->vp, 0); buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz, pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL, curthread); if (error != 0) { uprintf("i/o error PT_NOTE\n"); res = FALSE; goto ret; } note = note0 = (const Elf_Note *)buf; note_end = (const Elf_Note *)(buf + pnote->p_filesz); } else { note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset); note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); buf = NULL; } for (i = 0; i < 100 && note >= note0 && note < note_end; i++) { if (!aligned(note, Elf32_Addr) || (const char *)note_end - (const char *)note < sizeof(Elf_Note)) { res = FALSE; goto ret; } if (note->n_namesz != checknote->hdr.n_namesz || note->n_descsz != checknote->hdr.n_descsz || note->n_type != checknote->hdr.n_type) goto nextnote; note_name = (const char *)(note + 1); if (note_name + checknote->hdr.n_namesz >= (const char *)note_end || strncmp(checknote->vendor, note_name, checknote->hdr.n_namesz) != 0) goto nextnote; /* * Fetch the osreldate for binary * from the ELF OSABI-note if necessary. */ if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 && checknote->trans_osrel != NULL) { res = checknote->trans_osrel(note, osrel); goto ret; } res = TRUE; goto ret; nextnote: note = (const Elf_Note *)((const char *)(note + 1) + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE)); } res = FALSE; ret: free(buf, M_TEMP); return (res); } /* * Try to find the appropriate ABI-note section for checknote, * fetch the osreldate for binary from the ELF OSABI-note. Only the * first page of the image is searched, the same as for headers. */ static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel) { const Elf_Phdr *phdr; const Elf_Ehdr *hdr; int i; hdr = (const Elf_Ehdr *)imgp->image_header; phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i])) return (TRUE); } return (FALSE); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); static vm_prot_t __elfN(trans_prot)(Elf_Word flags) { vm_prot_t prot; prot = 0; if (flags & PF_X) prot |= VM_PROT_EXECUTE; if (flags & PF_W) prot |= VM_PROT_WRITE; if (flags & PF_R) prot |= VM_PROT_READ; #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) if (i386_read_exec && (flags & PF_R)) prot |= VM_PROT_EXECUTE; #endif #endif return (prot); } static Elf_Word __elfN(untrans_prot)(vm_prot_t prot) { Elf_Word flags; flags = 0; if (prot & VM_PROT_EXECUTE) flags |= PF_X; if (prot & VM_PROT_READ) flags |= PF_R; if (prot & VM_PROT_WRITE) flags |= PF_W; return (flags); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 297390) +++ head/sys/kern/kern_exec.c (revision 297391) @@ -1,1618 +1,1614 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); /* * Do the best to calculate the full path to the image file. */ if (imgp->auxargs != NULL && ((args->fname != NULL && args->fname[0] == '/') || vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0)) imgp->execpath = args->fname; if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } else { oldsigacts = NULL; newsigacts = NULL; /* satisfy gcc */ } PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; oldcred = p->p_ucred; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); if (error != 0) goto done1; newcred = crdup(oldcred); euip = uifind(attr.va_uid); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); /* * Set the new credentials. */ if (attr.va_mode & S_ISUID) change_euid(newcred, euip); if (attr.va_mode & S_ISGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, newcred, imgp->vp, interpvplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); proc_set_cred(p, newcred); } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); newcred = crdup(oldcred); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); proc_set_cred(p, newcred); } } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); VOP_UNLOCK(imgp->vp, 0); done1: /* * Free any resources malloc'd earlier that we didn't use. */ if (euip != NULL) uifree(euip); if (newcred != NULL) crfree(oldcred); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif vn_lock(imgp->vp, LK_SHARED | LK_RETRY); pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { PROC_LOCK(p); td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); done2: #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL); if (ma[0]->valid != VM_PAGE_BITS_ALL) { if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); vm_page_xunbusy(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); vm_page_xunbusy(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_xunbusy(ma[0]); vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error) { vm_object_deallocate(obj); return (error); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. */ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. Returns zero if the allocation succeeds * and ENOMEM otherwise. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX); return (args->buf != NULL ? 0 : ENOMEM); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { kmap_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); - if (newexecsw == NULL) - return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); - if (newexecsw == NULL) - return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_linker.c =================================================================== --- head/sys/kern/kern_linker.c (revision 297390) +++ head/sys/kern/kern_linker.c (revision 297391) @@ -1,2150 +1,2148 @@ /*- * Copyright (c) 1997-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kld.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "linker_if.h" #ifdef HWPMC_HOOKS #include #endif #ifdef KLD_DEBUG int kld_debug = 0; SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RWTUN, &kld_debug, 0, "Set various levels of KLD debug"); #endif /* These variables are used by kernel debuggers to enumerate loaded files. */ const int kld_off_address = offsetof(struct linker_file, address); const int kld_off_filename = offsetof(struct linker_file, filename); const int kld_off_pathname = offsetof(struct linker_file, pathname); const int kld_off_next = offsetof(struct linker_file, link.tqe_next); /* * static char *linker_search_path(const char *name, struct mod_depend * *verinfo); */ static const char *linker_basename(const char *path); /* * Find a currently loaded file given its filename. */ static linker_file_t linker_find_file_by_name(const char* _filename); /* * Find a currently loaded file given its file id. */ static linker_file_t linker_find_file_by_id(int _fileid); /* Metadata from the static kernel */ SET_DECLARE(modmetadata_set, struct mod_metadata); MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); linker_file_t linker_kernel_file; static struct sx kld_sx; /* kernel linker lock */ /* * Load counter used by clients to determine if a linker file has been * re-loaded. This counter is incremented for each file load. */ static int loadcnt; static linker_class_list_t classes; static linker_file_list_t linker_files; static int next_file_id = 1; static int linker_no_more_classes = 0; #define LINKER_GET_NEXT_FILE_ID(a) do { \ linker_file_t lftmp; \ \ if (!cold) \ sx_assert(&kld_sx, SA_XLOCKED); \ retry: \ TAILQ_FOREACH(lftmp, &linker_files, link) { \ if (next_file_id == lftmp->id) { \ next_file_id++; \ goto retry; \ } \ } \ (a) = next_file_id; \ } while(0) /* XXX wrong name; we're looking at version provision tags here, not modules */ typedef TAILQ_HEAD(, modlist) modlisthead_t; struct modlist { TAILQ_ENTRY(modlist) link; /* chain together all modules */ linker_file_t container; const char *name; int version; }; typedef struct modlist *modlist_t; static modlisthead_t found_modules; static int linker_file_add_dependency(linker_file_t file, linker_file_t dep); static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char* name, int deps); static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp); static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo); static void linker_init(void *arg) { sx_init(&kld_sx, "kernel linker"); TAILQ_INIT(&classes); TAILQ_INIT(&linker_files); } SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0); static void linker_stop_class_add(void *arg) { linker_no_more_classes = 1; } SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL); int linker_add_class(linker_class_t lc) { /* * We disallow any class registration past SI_ORDER_ANY * of SI_SUB_KLD. We bump the reference count to keep the * ops from being freed. */ if (linker_no_more_classes == 1) return (EPERM); kobj_class_compile((kobj_class_t) lc); ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */ TAILQ_INSERT_TAIL(&classes, lc, link); return (0); } static void linker_file_sysinit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) return; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_sysuninit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, NULL) != 0) return; /* * Perform a reverse bubble sort of the system initialization objects * by their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem > (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order >= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_register_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_register_sysctls: registering SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_register_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_unregister_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs" " for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_unregister_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static int linker_file_register_modules(linker_file_t lf) { struct mod_metadata **start, **stop, **mdp; const moduledata_t *moddata; int first_error, error; KLD_DPF(FILE, ("linker_file_register_modules: registering modules" " in %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "modmetadata_set", &start, &stop, NULL) != 0) { /* * This fallback should be unnecessary, but if we get booted * from boot2 instead of loader and we are missing our * metadata then we have to try the best we can. */ if (lf == linker_kernel_file) { start = SET_BEGIN(modmetadata_set); stop = SET_LIMIT(modmetadata_set); } else return (0); } first_error = 0; for (mdp = start; mdp < stop; mdp++) { if ((*mdp)->md_type != MDT_MODULE) continue; moddata = (*mdp)->md_data; KLD_DPF(FILE, ("Registering module %s in %s\n", moddata->name, lf->filename)); error = module_register(moddata, lf); if (error) { printf("Module %s failed to register: %d\n", moddata->name, error); if (first_error == 0) first_error = error; } } return (first_error); } static void linker_init_kernel_modules(void) { sx_xlock(&kld_sx); linker_file_register_modules(linker_kernel_file); sx_xunlock(&kld_sx); } SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, 0); static int linker_load_file(const char *filename, linker_file_t *result) { linker_class_t lc; linker_file_t lf; int foundfile, error, modules; /* Refuse to load modules if securelevel raised */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); lf = linker_find_file_by_name(filename); if (lf) { KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," " incrementing refs\n", filename)); *result = lf; lf->refs++; return (0); } foundfile = 0; error = 0; /* * We do not need to protect (lock) classes here because there is * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) * and there is no class deregistration mechanism at this time. */ TAILQ_FOREACH(lc, &classes, link) { KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", filename)); error = LINKER_LOAD_FILE(lc, filename, &lf); /* * If we got something other than ENOENT, then it exists but * we cannot load it for some other reason. */ if (error != ENOENT) foundfile = 1; if (lf) { error = linker_file_register_modules(lf); if (error == EEXIST) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } modules = !TAILQ_EMPTY(&lf->modules); linker_file_register_sysctls(lf); linker_file_sysinit(lf); lf->flags |= LINKER_FILE_LINKED; /* * If all of the modules in this file failed * to load, unload the file and return an * error of ENOEXEC. */ if (modules && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (ENOEXEC); } EVENTHANDLER_INVOKE(kld_load, lf); *result = lf; return (0); } } /* * Less than ideal, but tells the user whether it failed to load or * the module was not found. */ if (foundfile) { /* * If the file type has not been recognized by the last try * printout a message before to fail. */ if (error == ENOSYS) printf("linker_load_file: Unsupported file type\n"); /* * Format not recognized or otherwise unloadable. * When loading a module that is statically built into * the kernel EEXIST percolates back up as the return * value. Preserve this so that apps like sysinstall * can recognize this special case and not post bogus * dialog boxes. */ if (error != EEXIST) error = ENOEXEC; } else error = ENOENT; /* Nothing found */ return (error); } int linker_reference_module(const char *modname, struct mod_depend *verinfo, linker_file_t *result) { modlist_t mod; int error; sx_xlock(&kld_sx); if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { *result = mod->container; (*result)->refs++; sx_xunlock(&kld_sx); return (0); } error = linker_load_module(NULL, modname, NULL, verinfo, result); sx_xunlock(&kld_sx); return (error); } int linker_release_module(const char *modname, struct mod_depend *verinfo, linker_file_t lf) { modlist_t mod; int error; sx_xlock(&kld_sx); if (lf == NULL) { KASSERT(modname != NULL, ("linker_release_module: no file or name")); mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { sx_xunlock(&kld_sx); return (ESRCH); } lf = mod->container; } else KASSERT(modname == NULL && verinfo == NULL, ("linker_release_module: both file and name")); error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL); sx_xunlock(&kld_sx); return (error); } static linker_file_t linker_find_file_by_name(const char *filename) { linker_file_t lf; char *koname; koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); sprintf(koname, "%s.ko", filename); sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) { if (strcmp(lf->filename, koname) == 0) break; if (strcmp(lf->filename, filename) == 0) break; } free(koname, M_LINKER); return (lf); } static linker_file_t linker_find_file_by_id(int fileid) { linker_file_t lf; sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED) break; return (lf); } int linker_file_foreach(linker_predicate_t *predicate, void *context) { linker_file_t lf; int retval = 0; sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { retval = predicate(lf, context); if (retval != 0) break; } sx_xunlock(&kld_sx); return (retval); } linker_file_t linker_make_file(const char *pathname, linker_class_t lc) { linker_file_t lf; const char *filename; if (!cold) sx_assert(&kld_sx, SA_XLOCKED); filename = linker_basename(pathname); KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname)); lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); if (lf == NULL) return (NULL); lf->ctors_addr = 0; lf->ctors_size = 0; lf->refs = 1; lf->userrefs = 0; lf->flags = 0; lf->filename = strdup(filename, M_LINKER); lf->pathname = strdup(pathname, M_LINKER); LINKER_GET_NEXT_FILE_ID(lf->id); lf->ndeps = 0; lf->deps = NULL; lf->loadcnt = ++loadcnt; STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); return (lf); } int linker_file_unload(linker_file_t file, int flags) { module_t mod, next; modlist_t ml, nextml; struct common_symbol *cp; int error, i; /* Refuse to unload modules if securelevel raised. */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); /* Easy case of just dropping a reference. */ if (file->refs > 1) { file->refs--; return (0); } /* Give eventhandlers a chance to prevent the unload. */ error = 0; EVENTHANDLER_INVOKE(kld_unload_try, file, &error); if (error != 0) return (EBUSY); KLD_DPF(FILE, ("linker_file_unload: file is unloading," " informing modules\n")); /* * Quiesce all the modules to give them a chance to veto the unload. */ MOD_SLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = module_getfnext(mod)) { error = module_quiesce(mod); if (error != 0 && flags != LINKER_UNLOAD_FORCE) { KLD_DPF(FILE, ("linker_file_unload: module %s" " vetoed unload\n", module_getname(mod))); /* * XXX: Do we need to tell all the quiesced modules * that they can resume work now via a new module * event? */ MOD_SUNLOCK; return (error); } } MOD_SUNLOCK; /* * Inform any modules associated with this file that they are * being unloaded. */ MOD_XLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { next = module_getfnext(mod); MOD_XUNLOCK; /* * Give the module a chance to veto the unload. */ if ((error = module_unload(mod)) != 0) { #ifdef KLD_DEBUG MOD_SLOCK; KLD_DPF(FILE, ("linker_file_unload: module %s" " failed unload\n", module_getname(mod))); MOD_SUNLOCK; #endif return (error); } MOD_XLOCK; module_release(mod); } MOD_XUNLOCK; TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) { if (ml->container == file) { TAILQ_REMOVE(&found_modules, ml, link); free(ml, M_LINKER); } } /* * Don't try to run SYSUNINITs if we are unloaded due to a * link error. */ if (file->flags & LINKER_FILE_LINKED) { file->flags &= ~LINKER_FILE_LINKED; linker_file_sysuninit(file); linker_file_unregister_sysctls(file); } TAILQ_REMOVE(&linker_files, file, link); if (file->deps) { for (i = 0; i < file->ndeps; i++) linker_file_unload(file->deps[i], flags); free(file->deps, M_LINKER); file->deps = NULL; } while ((cp = STAILQ_FIRST(&file->common)) != NULL) { STAILQ_REMOVE_HEAD(&file->common, link); free(cp, M_LINKER); } LINKER_UNLOAD(file); EVENTHANDLER_INVOKE(kld_unload, file->filename, file->address, file->size); if (file->filename) { free(file->filename, M_LINKER); file->filename = NULL; } if (file->pathname) { free(file->pathname, M_LINKER); file->pathname = NULL; } kobj_delete((kobj_t) file, M_LINKER); return (0); } int linker_ctf_get(linker_file_t file, linker_ctf_t *lc) { return (LINKER_CTF_GET(file, lc)); } static int linker_file_add_dependency(linker_file_t file, linker_file_t dep) { linker_file_t *newdeps; sx_assert(&kld_sx, SA_XLOCKED); file->deps = realloc(file->deps, (file->ndeps + 1) * sizeof(*newdeps), M_LINKER, M_WAITOK | M_ZERO); file->deps[file->ndeps] = dep; file->ndeps++; KLD_DPF(FILE, ("linker_file_add_dependency:" " adding %s as dependency for %s\n", dep->filename, file->filename)); return (0); } /* * Locate a linker set and its contents. This is a helper function to avoid * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **. * This function is used in this file so we can avoid having lots of (void **) * casts. */ int linker_file_lookup_set(linker_file_t file, const char *name, void *firstp, void *lastp, int *countp) { sx_assert(&kld_sx, SA_LOCKED); return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); } /* * List all functions in a file. */ int linker_file_function_listall(linker_file_t lf, linker_function_nameval_callback_t callback_func, void *arg) { return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg)); } caddr_t linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) { caddr_t sym; int locked; locked = sx_xlocked(&kld_sx); if (!locked) sx_xlock(&kld_sx); sym = linker_file_lookup_symbol_internal(file, name, deps); if (!locked) sx_xunlock(&kld_sx); return (sym); } static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char *name, int deps) { c_linker_sym_t sym; linker_symval_t symval; caddr_t address; size_t common_size = 0; int i; sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n", file, name, deps)); if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { LINKER_SYMBOL_VALUES(file, sym, &symval); if (symval.value == 0) /* * For commons, first look them up in the * dependencies and only allocate space if not found * there. */ common_size = symval.size; else { KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" ".value=%p\n", symval.value)); return (symval.value); } } if (deps) { for (i = 0; i < file->ndeps; i++) { address = linker_file_lookup_symbol_internal( file->deps[i], name, 0); if (address) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " deps value=%p\n", address)); return (address); } } } if (common_size > 0) { /* * This is a common symbol which was not found in the * dependencies. We maintain a simple common symbol table in * the file object. */ struct common_symbol *cp; STAILQ_FOREACH(cp, &file->common, link) { if (strcmp(cp->name, name) == 0) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " old common value=%p\n", cp->address)); return (cp->address); } } /* * Round the symbol size up to align. */ common_size = (common_size + sizeof(int) - 1) & -sizeof(int); cp = malloc(sizeof(struct common_symbol) + common_size + strlen(name) + 1, M_LINKER, M_WAITOK | M_ZERO); cp->address = (caddr_t)(cp + 1); cp->name = cp->address + common_size; strcpy(cp->name, name); bzero(cp->address, common_size); STAILQ_INSERT_TAIL(&file->common, cp, link); KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" " value=%p\n", cp->address)); return (cp->address); } KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); return (0); } /* * Both DDB and stack(9) rely on the kernel linker to provide forward and * backward lookup of symbols. However, DDB and sometimes stack(9) need to * do this in a lockfree manner. We provide a set of internal helper * routines to perform these operations without locks, and then wrappers that * optionally lock. * * linker_debug_lookup() is ifdef DDB as currently it's only used by DDB. */ #ifdef DDB static int linker_debug_lookup(const char *symstr, c_linker_sym_t *sym) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0) return (0); } return (ENOENT); } #endif static int linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { linker_file_t lf; c_linker_sym_t best, es; u_long diff, bestdiff, off; best = 0; off = (uintptr_t)value; bestdiff = off; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) continue; if (es != 0 && diff < bestdiff) { best = es; bestdiff = diff; } if (bestdiff == 0) break; } if (best) { *sym = best; *diffp = bestdiff; return (0); } else { *sym = 0; *diffp = off; return (ENOENT); } } static int linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0) return (0); } return (ENOENT); } static int linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { linker_symval_t symval; c_linker_sym_t sym; int error; *offset = 0; error = linker_debug_search_symbol(value, &sym, offset); if (error) return (error); error = linker_debug_symbol_values(sym, &symval); if (error) return (error); strlcpy(buf, symval.name, buflen); return (0); } /* * DDB Helpers. DDB has to look across multiple files with their own symbol * tables and string tables. * * Note that we do not obey list locking protocols here. We really don't need * DDB to hang because somebody's got the lock held. We'll take the chance * that the files list is inconsistant instead. */ #ifdef DDB int linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) { return (linker_debug_lookup(symstr, sym)); } #endif int linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { return (linker_debug_search_symbol(value, sym, diffp)); } int linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { return (linker_debug_symbol_values(sym, symval)); } int linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_debug_search_symbol_name(value, buf, buflen, offset)); } /* * stack(9) helper for non-debugging environemnts. Unlike DDB helpers, we do * obey locking protocols, and offer a significantly less complex interface. */ int linker_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { int error; sx_slock(&kld_sx); error = linker_debug_search_symbol_name(value, buf, buflen, offset); sx_sunlock(&kld_sx); return (error); } /* * Syscalls. */ int kern_kldload(struct thread *td, const char *file, int *fileid) { const char *kldname, *modname; linker_file_t lf; int error; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); /* * It is possible that kldloaded module will attach a new ifnet, * so vnet context must be set when this ocurs. */ CURVNET_SET(TD_TO_VNET(td)); /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name. */ if (strchr(file, '/') || strchr(file, '.')) { kldname = file; modname = NULL; } else { kldname = NULL; modname = file; } sx_xlock(&kld_sx); error = linker_load_module(kldname, modname, NULL, NULL, &lf); if (error) { sx_xunlock(&kld_sx); goto done; } lf->userrefs++; if (fileid != NULL) *fileid = lf->id; sx_xunlock(&kld_sx); done: CURVNET_RESTORE(); return (error); } int sys_kldload(struct thread *td, struct kldload_args *uap) { char *pathname = NULL; int error, fileid; td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL); if (error == 0) { error = kern_kldload(td, pathname, &fileid); if (error == 0) td->td_retval[0] = fileid; } free(pathname, M_TEMP); return (error); } int kern_kldunload(struct thread *td, int fileid, int flags) { linker_file_t lf; int error = 0; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); CURVNET_SET(TD_TO_VNET(td)); sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf) { KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); if (lf->userrefs == 0) { /* * XXX: maybe LINKER_UNLOAD_FORCE should override ? */ printf("kldunload: attempt to unload file that was" " loaded by the kernel\n"); error = EBUSY; } else { lf->userrefs--; error = linker_file_unload(lf, flags); if (error) lf->userrefs++; } } else error = ENOENT; sx_xunlock(&kld_sx); CURVNET_RESTORE(); return (error); } int sys_kldunload(struct thread *td, struct kldunload_args *uap) { return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL)); } int sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap) { if (uap->flags != LINKER_UNLOAD_NORMAL && uap->flags != LINKER_UNLOAD_FORCE) return (EINVAL); return (kern_kldunload(td, uap->fileid, uap->flags)); } int sys_kldfind(struct thread *td, struct kldfind_args *uap) { char *pathname; const char *filename; linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0) goto out; filename = linker_basename(pathname); sx_xlock(&kld_sx); lf = linker_find_file_by_name(filename); if (lf) td->td_retval[0] = lf->id; else error = ENOENT; sx_xunlock(&kld_sx); out: free(pathname, M_TEMP); return (error); } int sys_kldnext(struct thread *td, struct kldnext_args *uap) { linker_file_t lf; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); if (uap->fileid == 0) lf = TAILQ_FIRST(&linker_files); else { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { error = ENOENT; goto out; } lf = TAILQ_NEXT(lf, link); } /* Skip partially loaded files. */ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED)) lf = TAILQ_NEXT(lf, link); if (lf) td->td_retval[0] = lf->id; else td->td_retval[0] = 0; out: sx_xunlock(&kld_sx); return (error); } int sys_kldstat(struct thread *td, struct kldstat_args *uap) { struct kld_file_stat stat; int error, version; /* * Check the version of the user's structure. */ if ((error = copyin(&uap->stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct kld_file_stat_1) && version != sizeof(struct kld_file_stat)) return (EINVAL); error = kern_kldstat(td, uap->fileid, &stat); if (error != 0) return (error); return (copyout(&stat, uap->stat, version)); } int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) { linker_file_t lf; int namelen; #ifdef MAC int error; error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf == NULL) { sx_xunlock(&kld_sx); return (ENOENT); } /* Version 1 fields: */ namelen = strlen(lf->filename) + 1; if (namelen > MAXPATHLEN) namelen = MAXPATHLEN; bcopy(lf->filename, &stat->name[0], namelen); stat->refs = lf->refs; stat->id = lf->id; stat->address = lf->address; stat->size = lf->size; /* Version 2 fields: */ namelen = strlen(lf->pathname) + 1; if (namelen > MAXPATHLEN) namelen = MAXPATHLEN; bcopy(lf->pathname, &stat->pathname[0], namelen); sx_xunlock(&kld_sx); td->td_retval[0] = 0; return (0); } int sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) { linker_file_t lf; module_t mp; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(uap->fileid); if (lf) { MOD_SLOCK; mp = TAILQ_FIRST(&lf->modules); if (mp != NULL) td->td_retval[0] = module_getid(mp); else td->td_retval[0] = 0; MOD_SUNLOCK; } else error = ENOENT; sx_xunlock(&kld_sx); return (error); } int sys_kldsym(struct thread *td, struct kldsym_args *uap) { char *symstr = NULL; c_linker_sym_t sym; linker_symval_t symval; linker_file_t lf; struct kld_sym_lookup lookup; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0) return (error); if (lookup.version != sizeof(lookup) || uap->cmd != KLDSYM_LOOKUP) return (EINVAL); symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) goto out; sx_xlock(&kld_sx); if (uap->fileid != 0) { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) error = ENOENT; else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t) symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); } else error = ENOENT; } else { TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); break; } } if (lf == NULL) error = ENOENT; } sx_xunlock(&kld_sx); out: free(symstr, M_TEMP); return (error); } /* * Preloaded module support */ static modlist_t modlist_lookup(const char *name, int ver) { modlist_t mod; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) == 0 && (ver == 0 || mod->version == ver)) return (mod); } return (NULL); } static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo) { modlist_t mod, bestmod; int ver; if (verinfo == NULL) return (modlist_lookup(name, 0)); bestmod = NULL; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) != 0) continue; ver = mod->version; if (ver == verinfo->md_ver_preferred) return (mod); if (ver >= verinfo->md_ver_minimum && ver <= verinfo->md_ver_maximum && (bestmod == NULL || ver > bestmod->version)) bestmod = mod; } return (bestmod); } static modlist_t modlist_newmodule(const char *modname, int version, linker_file_t container) { modlist_t mod; mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); if (mod == NULL) panic("no memory for module list"); mod->container = container; mod->name = modname; mod->version = version; TAILQ_INSERT_TAIL(&found_modules, mod, link); return (mod); } static void linker_addmodules(linker_file_t lf, struct mod_metadata **start, struct mod_metadata **stop, int preload) { struct mod_metadata *mp, **mdp; const char *modname; int ver; for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; if (modlist_lookup(modname, ver) != NULL) { printf("module %s already present!\n", modname); /* XXX what can we do? this is a build error. :-( */ continue; } modlist_newmodule(modname, ver, lf); } } static void linker_preload(void *arg) { caddr_t modptr; const char *modname, *nmodname; char *modtype; linker_file_t lf, nlf; linker_class_t lc; int error; linker_file_list_t loaded_files; linker_file_list_t depended_files; struct mod_metadata *mp, *nmp; struct mod_metadata **start, **stop, **mdp, **nmdp; const struct mod_depend *verinfo; int nver; int resolves; modlist_t mod; struct sysinit **si_start, **si_stop; TAILQ_INIT(&loaded_files); TAILQ_INIT(&depended_files); TAILQ_INIT(&found_modules); error = 0; modptr = NULL; sx_xlock(&kld_sx); while ((modptr = preload_search_next_name(modptr)) != NULL) { modname = (char *)preload_search_info(modptr, MODINFO_NAME); modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); if (modname == NULL) { printf("Preloaded module at %p does not have a" " name!\n", modptr); continue; } if (modtype == NULL) { printf("Preloaded module at %p does not have a type!\n", modptr); continue; } if (bootverbose) printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); lf = NULL; TAILQ_FOREACH(lc, &classes, link) { error = LINKER_LINK_PRELOAD(lc, modname, &lf); if (!error) break; lf = NULL; } if (lf) TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); } /* * First get a list of stuff in the kernel. */ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, &stop, NULL) == 0) linker_addmodules(linker_kernel_file, start, stop, 1); /* * This is a once-off kinky bubble sort to resolve relocation * dependency requirements. */ restart: TAILQ_FOREACH(lf, &loaded_files, loaded) { error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); /* * First, look to see if we would successfully link with this * stuff. */ resolves = 1; /* unless we know otherwise */ if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop) /* it's a self reference */ continue; /* * ok, the module isn't here yet, we * are not finished */ if (modlist_lookup2(modname, verinfo) == NULL) resolves = 0; } } /* * OK, if we found our modules, we can link. So, "provide" * the modules inside and add it to the end of the link order * list. */ if (resolves) { if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; nver = ((const struct mod_version *) mp->md_data)->mv_version; if (modlist_lookup(modname, nver) != NULL) { printf("module %s already" " present!\n", modname); TAILQ_REMOVE(&loaded_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); /* we changed tailq next ptr */ goto restart; } modlist_newmodule(modname, nver, lf); } } TAILQ_REMOVE(&loaded_files, lf, loaded); TAILQ_INSERT_TAIL(&depended_files, lf, loaded); /* * Since we provided modules, we need to restart the * sort so that the previous files that depend on us * have a chance. Also, we've busted the tailq next * pointer with the REMOVE. */ goto restart; } } /* * At this point, we check to see what could not be resolved.. */ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) { TAILQ_REMOVE(&loaded_files, lf, loaded); printf("KLD file %s is missing dependencies\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } /* * We made it. Finish off the linking in the order we determined. */ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) { if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) panic("cannot add dependency"); } lf->userrefs++; /* so we can (try to) kldunload it */ error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { printf("KLD file %s - cannot find " "dependency \"%s\"\n", lf->filename, modname); goto fail; } /* Don't count self-dependencies */ if (lf == mod->container) continue; mod->container->refs++; error = linker_file_add_dependency(lf, mod->container); if (error) panic("cannot add dependency"); } } /* * Now do relocation etc using the symbol search paths * established by the dependencies */ error = LINKER_LINK_PRELOAD_FINISH(lf); if (error) { printf("KLD file %s - could not finalize loading\n", lf->filename); goto fail; } linker_file_register_modules(lf); if (linker_file_lookup_set(lf, "sysinit_set", &si_start, &si_stop, NULL) == 0) sysinit_add(si_start, si_stop); linker_file_register_sysctls(lf); lf->flags |= LINKER_FILE_LINKED; continue; fail: TAILQ_REMOVE(&depended_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } sx_xunlock(&kld_sx); /* woohoo! we made it! */ } SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0); /* * Search for a not-loaded module by name. * * Modules may be found in the following locations: * * - preloaded (result is just the module name) - on disk (result is full path * to module) * * If the module name is qualified in any way (contains path, etc.) the we * simply return a copy of it. * * The search path can be manipulated via sysctl. Note that we use the ';' * character as a separator to be consistent with the bootloader. */ static char linker_hintfile[] = "linker.hints"; static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules"; SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RWTUN, linker_path, sizeof(linker_path), "module load search path"); TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); static char *linker_ext_list[] = { "", ".ko", NULL }; /* * Check if file actually exists either with or without extension listed in * the linker_ext_list. (probably should be generic for the rest of the * kernel) */ static char * linker_lookup_file(const char *path, int pathlen, const char *name, int namelen, struct vattr *vap) { struct nameidata nd; struct thread *td = curthread; /* XXX */ char *result, **cpp, *sep; int error, len, extlen, reclen, flags; enum vtype type; extlen = 0; for (cpp = linker_ext_list; *cpp; cpp++) { len = strlen(*cpp); if (len > extlen) extlen = len; } extlen++; /* trailing '\0' */ sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = pathlen + strlen(sep) + namelen + extlen + 1; result = malloc(reclen, M_LINKER, M_WAITOK); for (cpp = linker_ext_list; *cpp; cpp++) { snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, namelen, name, *cpp); /* * Attempt to open the file, and return the path if * we succeed and it's a regular file. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); type = nd.ni_vp->v_type; if (vap) VOP_GETATTR(nd.ni_vp, vap, td->td_ucred); VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (type == VREG) return (result); } } free(result, M_LINKER); return (NULL); } #define INT_ALIGN(base, ptr) ptr = \ (base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1)) /* * Lookup KLD which contains requested module in the "linker.hints" file. If * version specification is available, then try to find the best KLD. * Otherwise just find the latest one. */ static char * linker_hints_lookup(const char *path, int pathlen, const char *modname, int modnamelen, const struct mod_depend *verinfo) { struct thread *td = curthread; /* XXX */ struct ucred *cred = td ? td->td_ucred : NULL; struct nameidata nd; struct vattr vattr, mattr; u_char *hints = NULL; u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep; int error, ival, bestver, *intp, found, flags, clen, blen; ssize_t reclen; result = NULL; bestver = found = 0; sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + strlen(sep) + 1; pathbuf = malloc(reclen, M_LINKER, M_WAITOK); snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, linker_hintfile); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) goto bad; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) goto bad; best = cp = NULL; error = VOP_GETATTR(nd.ni_vp, &vattr, cred); if (error) goto bad; /* * XXX: we need to limit this number to some reasonable value */ if (vattr.va_size > LINKER_HINTS_MAX) { printf("hints file too large %ld\n", (long)vattr.va_size); goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); - if (hints == NULL) - goto bad; error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) goto bad; VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); nd.ni_vp = NULL; if (reclen != 0) { printf("can't read %zd\n", reclen); goto bad; } intp = (int *)hints; ival = *intp++; if (ival != LINKER_HINTS_VERSION) { printf("hints file version mismatch %d\n", ival); goto bad; } bufend = hints + vattr.va_size; recptr = (u_char *)intp; clen = blen = 0; while (recptr < bufend && !found) { intp = (int *)recptr; reclen = *intp++; ival = *intp++; cp = (char *)intp; switch (ival) { case MDT_VERSION: clen = *cp++; if (clen != modnamelen || bcmp(cp, modname, clen) != 0) break; cp += clen; INT_ALIGN(hints, cp); ival = *(int *)cp; cp += sizeof(int); clen = *cp++; if (verinfo == NULL || ival == verinfo->md_ver_preferred) { found = 1; break; } if (ival >= verinfo->md_ver_minimum && ival <= verinfo->md_ver_maximum && ival > bestver) { bestver = ival; best = cp; blen = clen; } break; default: break; } recptr += reclen + sizeof(int); } /* * Finally check if KLD is in the place */ if (found) result = linker_lookup_file(path, pathlen, cp, clen, &mattr); else if (best) result = linker_lookup_file(path, pathlen, best, blen, &mattr); /* * KLD is newer than hints file. What we should do now? */ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) printf("warning: KLD '%s' is newer than the linker.hints" " file\n", result); bad: free(pathbuf, M_LINKER); if (hints) free(hints, M_TEMP); if (nd.ni_vp != NULL) { VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); } /* * If nothing found or hints is absent - fallback to the old * way by using "kldname[.ko]" as module name. */ if (!found && !bestver && result == NULL) result = linker_lookup_file(path, pathlen, modname, modnamelen, NULL); return (result); } /* * Lookup KLD which contains requested module in the all directories. */ static char * linker_search_module(const char *modname, int modnamelen, const struct mod_depend *verinfo) { char *cp, *ep, *result; /* * traverse the linker path */ for (cp = linker_path; *cp; cp = ep + 1) { /* find the end of this component */ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); result = linker_hints_lookup(cp, ep - cp, modname, modnamelen, verinfo); if (result != NULL) return (result); if (*ep == 0) break; } return (NULL); } /* * Search for module in all directories listed in the linker_path. */ static char * linker_search_kld(const char *name) { char *cp, *ep, *result; int len; /* qualified at all? */ if (strchr(name, '/')) return (strdup(name, M_LINKER)); /* traverse the linker path */ len = strlen(name); for (ep = linker_path; *ep; ep++) { cp = ep; /* find the end of this component */ for (; *ep != 0 && *ep != ';'; ep++); result = linker_lookup_file(cp, ep - cp, name, len, NULL); if (result != NULL) return (result); } return (NULL); } static const char * linker_basename(const char *path) { const char *filename; filename = strrchr(path, '/'); if (filename == NULL) return path; if (filename[1]) filename++; return (filename); } #ifdef HWPMC_HOOKS /* * Inform hwpmc about the set of kernel modules currently loaded. */ void * linker_hwpmc_list_objects(void) { linker_file_t lf; struct pmckern_map_in *kobase; int i, nmappings; nmappings = 0; sx_slock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) nmappings++; /* Allocate nmappings + 1 entries. */ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER, M_WAITOK | M_ZERO); i = 0; TAILQ_FOREACH(lf, &linker_files, link) { /* Save the info for this linker file. */ kobase[i].pm_file = lf->filename; kobase[i].pm_address = (uintptr_t)lf->address; i++; } sx_sunlock(&kld_sx); KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?")); /* The last entry of the malloced area comprises of all zeros. */ KASSERT(kobase[i].pm_file == NULL, ("linker_hwpmc_list_objects: last object not NULL")); return ((void *)kobase); } #endif /* * Find a file which contains given module and load it, if "parent" is not * NULL, register a reference to it. */ static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp) { linker_file_t lfdep; const char *filename; char *pathname; int error; sx_assert(&kld_sx, SA_XLOCKED); if (modname == NULL) { /* * We have to load KLD */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); if (kldname != NULL) pathname = strdup(kldname, M_LINKER); else if (rootvnode == NULL) pathname = NULL; else /* * Need to find a KLD with required module */ pathname = linker_search_module(modname, strlen(modname), verinfo); } if (pathname == NULL) return (ENOENT); /* * Can't load more than one file with the same basename XXX: * Actually it should be possible to have multiple KLDs with * the same basename but different path because they can * provide different versions of the same modules. */ filename = linker_basename(pathname); if (linker_find_file_by_name(filename)) error = EEXIST; else do { error = linker_load_file(pathname, &lfdep); if (error) break; if (modname && verinfo && modlist_lookup2(modname, verinfo) == NULL) { linker_file_unload(lfdep, LINKER_UNLOAD_FORCE); error = ENOENT; break; } if (parent) { error = linker_file_add_dependency(parent, lfdep); if (error) break; } if (lfpp) *lfpp = lfdep; } while (0); free(pathname, M_LINKER); return (error); } /* * This routine is responsible for finding dependencies of userland initiated * kldload(2)'s of files. */ int linker_load_dependencies(linker_file_t lf) { linker_file_t lfdep; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_metadata *mp, *nmp; const struct mod_depend *verinfo; modlist_t mod; const char *modname, *nmodname; int ver, error = 0, count; /* * All files are dependant on /kernel. */ sx_assert(&kld_sx, SA_XLOCKED); if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) return (error); } if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, &count) != 0) return (0); for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; mod = modlist_lookup(modname, ver); if (mod != NULL) { printf("interface %s.%d already present in the KLD" " '%s'!\n", modname, ver, mod->container->filename); return (EEXIST); } } for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; nmodname = NULL; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop)/* early exit, it's a self reference */ continue; mod = modlist_lookup2(modname, verinfo); if (mod) { /* woohoo, it's loaded already */ lfdep = mod->container; lfdep->refs++; error = linker_file_add_dependency(lf, lfdep); if (error) break; continue; } error = linker_load_module(NULL, modname, lf, verinfo, NULL); if (error) { printf("KLD %s: depends on %s - not available or" " version mismatch\n", lf->filename, modname); break; } } if (error) return (error); linker_addmodules(lf, start, stop, 0); return (error); } static int sysctl_kern_function_list_iterate(const char *name, void *opaque) { struct sysctl_req *req; req = opaque; return (SYSCTL_OUT(req, name, strlen(name) + 1)); } /* * Export a nul-separated, double-nul-terminated list of all function names * in the kernel. */ static int sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) { linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(req->td->td_ucred); if (error) return (error); #endif error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { error = LINKER_EACH_FUNCTION_NAME(lf, sysctl_kern_function_list_iterate, req); if (error) { sx_xunlock(&kld_sx); return (error); } } sx_xunlock(&kld_sx); return (SYSCTL_OUT(req, "", 1)); } SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_kern_function_list, "", "kernel function list"); Index: head/sys/net/rtsock.c =================================================================== --- head/sys/net/rtsock.c (revision 297390) +++ head/sys/net/rtsock.c (revision 297391) @@ -1,1927 +1,1925 @@ /*- * Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_mpath.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; static VNET_DEFINE(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); struct walkarg { int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *m, struct socket *so, ...); static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void rts_init(void) { int tmp; if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, struct rawcb *rp) { int fibnum; KASSERT(m != NULL, ("%s: m is NULL", __func__)); KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); /* No filtering requested. */ if ((m->m_flags & RTS_FILTER_FIB) == 0) return (0); /* Check if it is a rts and the fib matches the one of the socket. */ fibnum = M_GETFIB(m); if (proto->sp_family != PF_ROUTE || rp->rcb_socket == NULL || rp->rcb_socket->so_fibnum == fibnum) return (0); /* Filtering requested and no match, the socket shall be skipped. */ return (1); } static void rts_input(struct mbuf *m) { struct sockproto route_proto; unsigned short *family; struct m_tag *tag; route_proto.sp_family = PF_ROUTE; tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); if (tag != NULL) { family = (unsigned short *)(tag + 1); route_proto.sp_protocol = *family; m_tag_delete(m, tag); } else route_proto.sp_protocol = 0; raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* * It really doesn't make any sense at all for this code to share much * with raw_usrreq.c, since its functionality is so restricted. XXX */ static void rts_abort(struct socket *so) { raw_usrreqs.pru_abort(so); } static void rts_close(struct socket *so) { raw_usrreqs.pru_close(so); } /* pru_accept is EOPNOTSUPP */ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); - if (rp == NULL) - return ENOBUFS; so->so_pcb = (caddr_t)rp; so->so_fibnum = td->td_proc->p_fibnum; error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { so->so_pcb = NULL; free(rp, M_PCB); return error; } RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; return 0; } static int rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ /* pru_control is EOPNOTSUPP */ static void rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); KASSERT(rp != NULL, ("rts_detach: rp == NULL")); RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } static int rts_disconnect(struct socket *so) { return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ /* pru_rcvoob is EOPNOTSUPP */ static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); } /* pru_sense is null */ static int rts_shutdown(struct socket *so) { return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { .pru_abort = rts_abort, .pru_attach = rts_attach, .pru_bind = rts_bind, .pru_connect = rts_connect, .pru_detach = rts_detach, .pru_disconnect = rts_disconnect, .pru_peeraddr = rts_peeraddr, .pru_send = rts_send, .pru_shutdown = rts_shutdown, .pru_sockaddr = rts_sockaddr, .pru_close = rts_close, }; #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. */ ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } /*ARGSUSED*/ static int route_output(struct mbuf *m, struct socket *so, ...) { struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct rib_head *rnh; struct rt_addrinfo info; struct sockaddr_storage ss; #ifdef INET6 struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; struct rawcb *rp = NULL; struct walkarg w; fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. */ alloc_len = roundup2(len, 1024); if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); bzero(&w, sizeof(w)); if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ rtm->rtm_pid = curproc->p_pid; info.rti_addrs = rtm->rtm_addrs; info.rti_mflags = rtm->rtm_inits; info.rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error) senderr(error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct rt_addrinfo ginfo; struct sockaddr *gdst; bzero(&ginfo, sizeof(ginfo)); bzero(&ss, sizeof(ss)); ss.ss_len = sizeof(ss); ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss; gdst = info.rti_info[RTAX_GATEWAY]; /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. */ if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) { if (ss.ss_family == AF_LINK && ginfo.rti_ifp->if_flags & IFF_LOOPBACK) { info.rti_flags &= ~RTF_GATEWAY; info.rti_flags |= RTF_GWFLAG_COMPAT; } rib_free_info(&ginfo); } } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: case RTM_CHANGE: if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, fibnum); if (error == 0 && saved_nrt != NULL) { #ifdef INET6 rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif RT_LOCK(saved_nrt); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: saved_nrt = NULL; /* support for new ARP code */ if (info.rti_info[RTAX_GATEWAY] && (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); #ifdef INET6 if (error == 0) rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; } error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } #ifdef INET6 /* rt_msg2() will not be used when RTM_DELETE fails. */ rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; #endif break; case RTM_GET: rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) senderr(EAFNOSUPPORT); RIB_RLOCK(rnh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { /* * Provide logest prefix match for * address lookup (no mask). * 'route -n get addr' */ rt = (struct rtentry *) rnh->rnh_matchaddr( info.rti_info[RTAX_DST], &rnh->head); } else rt = (struct rtentry *) rnh->rnh_lookup( info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } #ifdef RADIX_MPATH /* * for RTM_CHANGE/LOCK, if we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. * * for RTM_GET, gate is optional even with multipath. * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ if (rt_mpath_capable(rnh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } #endif /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. */ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr laddr; if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, rt->rt_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); RIB_RUNLOCK(rnh); report: RT_LOCK_ASSERT(rt); if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(curthread->td_ucred) : prison_if(curthread->td_ucred, rt_key(rt)) != 0) { RT_UNLOCK(rt); senderr(ESRCH); } info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(&info, ifp, rt, &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } if (ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info.rti_info[RTAX_IFP] = NULL; info.rti_info[RTAX_IFA] = NULL; } } else if ((ifp = rt->rt_ifp) != NULL) { rtm->rtm_index = ifp->if_index; } /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *new_rtm; new_rtm = malloc(len, M_TEMP, M_NOWAIT); if (new_rtm == NULL) { RT_UNLOCK(rt); senderr(ENOBUFS); } bcopy(rtm, new_rtm, rtm->rtm_msglen); free(rtm, M_TEMP); rtm = new_rtm; alloc_len = len; } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } flush: if (rt != NULL) RTFREE(rt); /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } if (rtm != NULL) { #ifdef INET6 if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. */ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } } #endif if (error != 0) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by * invalidating our protocol */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else rt_dispatch(m, saf); } return (error); } static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = rt->rt_mtu; out->rmx_weight = rt->rt_weight; out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); /* Kernel -> userland timebase conversion. */ out->rmx_expire = rt->rt_expire ? rt->rt_expire - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) return (EINVAL); /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ static struct sockaddr * rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? */ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { int i; int len, buflen = 0, dlen; caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; #endif switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) len = sizeof(struct ifa_msghdrl32); else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); if (cp != NULL && buflen >= dlen) { #ifdef INET6 if (V_deembed_scopeid && sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)&ss; bcopy(sa, sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) sa = (struct sockaddr *)sin6; } #endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; buflen -= dlen; } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. */ cp = NULL; } len += dlen; } if (cp != NULL) { dlen = ALIGN(len) - len; if (buflen < dlen) cp = NULL; else buflen -= dlen; } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occured, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal. * Please do not call directly, use rt_routemsg(). * Note that @rt data MAY be inconsistent/invalid: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * * Returns 0 on success. */ int rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; struct mbuf *m; struct rt_msghdr *rtm; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = sa = rt_key(rt); info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL) return (ENOBUFS); rtm = mtod(m, struct rt_msghdr *); rtm->rtm_index = ifp->if_index; rtm->rtm_flags |= rt->rt_flags; rtm->rtm_errno = error; rtm->rtm_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; /* * Preserve the family from the sockaddr, if any, in an m_tag for * use when injecting the mbuf into the routing socket buffer from * the netisr. */ if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } #endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct radix_node *rn, void *vw) { struct walkarg *w = vw; struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; struct sockaddr_storage ss; if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; if ((rt->rt_flags & RTF_HOST) == 0 ? jailed_without_vnet(w->w_req->td->td_ucred) : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; if (rt->rt_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rt->rt_flags & ~RTF_GWFLAG_COMPAT); else rtm->rtm_flags = rt->rt_flags; rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifd = &ifm->ifm_data; } if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. */ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &info, w, len); else error = sysctl_iflist_ifm(ifp, &info, w, len); if (error) goto done; } while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: if (ifp != NULL) IF_ADDR_RUNLOCK(ifp); IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifmultiaddr *ifma; struct rt_addrinfo info; int len, error = 0; struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) { IF_ADDR_RUNLOCK(ifp); goto done; } } } IF_ADDR_RUNLOCK(ifp); } done: IFNET_RUNLOCK_NOSLEEP(); return (error); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rib_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; name ++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { RIB_RLOCK(rnh); error = rnh->rnh_walktree(&rnh->head, sysctl_dumpentry, &w); RIB_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. */ static struct domain routedomain; /* or at least forward */ static struct protosw routesw[] = { { .pr_type = SOCK_RAW, .pr_domain = &routedomain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_output = route_output, .pr_ctlinput = raw_ctlinput, .pr_init = raw_init, .pr_usrreqs = &route_usrreqs } }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, .dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])] }; VNET_DOMAIN_SET(route); Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 297390) +++ head/sys/netinet/tcp_subr.c (revision 297391) @@ -1,2997 +1,2995 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #endif #ifdef TCP_RFC7413 #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #include #endif /*IPSEC*/ #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #ifdef TCP_SIGNATURE static int tcp_sig_checksigs = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW, &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic"); #endif VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); static void tcp_timer_discard(struct tcpcb *, uint32_t); static struct tcp_function_block tcp_def_funcblk = { "default", tcp_output, tcp_do_segment, tcp_default_ctloutput, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0 }; int t_functions_inited = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; static void init_tcp_functions(void) { if (t_functions_inited == 0) { TAILQ_INIT(&t_functions); rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); t_functions_inited = 1; } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { linesz = snprintf(cp, bufsz, "%-32s%c %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; struct osd osd; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } int register_tcp_functions(struct tcp_function_block *blk, int wait) { struct tcp_function_block *lblk; struct tcp_function *n; struct tcp_function_set fs; if (t_functions_inited == 0) { init_tcp_functions(); } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timers_left || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. */ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timers_left == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { return (EINVAL); } } n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { return (ENOMEM); } n->tf_fb = blk; strcpy(fs.function_set_name, blk->tfb_tcp_block_name); rw_wlock(&tcp_function_lock); lblk = find_tcp_functions_locked(&fs); if (lblk) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); return (EALREADY); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_flags = 0; TAILQ_INSERT_TAIL(&t_functions, n, tf_next); rw_wunlock(&tcp_function_lock); return(0); } int deregister_tcp_functions(struct tcp_function_block *blk) { struct tcp_function_block *lblk; struct tcp_function *f; int error=ENOENT; if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); if (blk == tcp_func_set_ptr) { /* You can't free the current default */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (blk->tfb_refcnt) { /* Still tcb attached, mark it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; rw_wunlock(&tcp_function_lock); return (EBUSY); } lblk = find_tcp_fb_locked(blk, &f); if (lblk) { /* Found */ TAILQ_REMOVE(&t_functions, f, tf_next); f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); error = 0; } rw_wunlock(&tcp_function_lock); return (error); } void tcp_init(void) { const char *tcbhash_tuneable; int hashsize; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose && IS_DEFAULT_VNET(curvnet)) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; tcp_reass_global_init(); /* XXX virtualize those bellow? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; /* Setup the tcp function block list */ init_tcp_functions(); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_usrreqs.pru_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); #ifdef TCPPCAP tcp_pcap_init(); #endif #ifdef TCP_RFC7413 tcp_fastopen_init(); #endif } #ifdef VIMAGE void tcp_destroy(void) { int error; #ifdef TCP_RFC7413 tcp_fastopen_destroy(); #endif tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } } #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct tcphdr *nth; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win; bool incl_opts; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_WLOCK_ASSERT(inp); } else inp = NULL; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { /* * reuse the mbuf. * XXX MRT We inherrit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef TCP_SIGNATURE if (to.to_flags & TOF_SIGNATURE) { tcp_signature_compute(m, 0, 0, optlen, to.to_signature, IPSEC_DIR_OUTBOUND); } #endif m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } #endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), tp, nth); TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET (void) ip_output(m, NULL, NULL, 0, NULL, inp); #endif } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* Initialise cc_var struct for this tcpcb. */ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } /* * Use the current system default CC algorithm. */ CC_LIST_RLOCK(); KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, 1); callout_init(&tp->t_timers->tt_persist, 1); callout_init(&tp->t_timers->tt_keep, 1); callout_init(&tp->t_timers->tt_2msl, 1); callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * The tcpcb will hold a reference on its inpcb until tcp_discardcb() * is called. */ in_pcbref(inp); /* Reference for tcpcb */ tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif return (tp); /* XXX */ } /* * Switch the congestion control algorithm back to NewReno for any active * control blocks using an algorithm which is about to go away. * This ensures the CC framework can allow the unload to proceed without leaving * any dangling pointers which would trigger a panic. * Returning non-zero would inform the CC framework that something went wrong * and it would be unsafe to allow the unload to proceed. However, there is no * way for this to occur with this implementation so we always return zero. */ int tcp_ccalgounload(struct cc_algo *unload_algo) { struct cc_algo *tmpalgo; struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); /* * Check all active control blocks across all network stacks and change * any that are using "unload_algo" back to NewReno. If "unload_algo" * requires cleanup code to be run, call it. */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code * because the INP_INFO_WLOCK is held during initialisation. We * therefore don't enter the loop below until the connection * list has stabilised. */ LIST_FOREACH(inp, &V_tcb, inp_list) { INP_WLOCK(inp); /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { /* * By holding INP_WLOCK here, we are assured * that the connection is not currently * executing inside the CC module's functions * i.e. it is safe to make the switch back to * NewReno. */ if (CC_ALGO(tp) == unload_algo) { tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ int released; INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we delete the * PCB. * * If stopping a timer fails, we schedule a discard function in same * callout, and the last discard function called will take care of * deleting the tcpcb. */ tcp_timer_stop(tp, TT_REXMT); tcp_timer_stop(tp, TT_PERSIST); tcp_timer_stop(tp, TT_KEEP); tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); if (tp->t_fb->tfb_tcp_timer_stop_all) { /* Call the stop-all function of the methods */ tp->t_fb->tfb_tcp_timer_stop_all(tp); } /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occured on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); khelp_destroy_osd(tp->osd); CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on tcpcb, let's free it. */ if ((tp->t_fb->tfb_tcp_timers_left) && (tp->t_fb->tfb_tcp_timers_left(tp))) { /* Some fb timers left running! */ return; } if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp %p should not have been released " "here", __func__, inp)); } } void tcp_timer_2msl_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_2MSL); } void tcp_timer_keep_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_KEEP); } void tcp_timer_persist_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_PERSIST); } void tcp_timer_rexmt_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_REXMT); } void tcp_timer_delack_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_DELACK); } void tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type) { struct inpcb *inp; CURVNET_SET(tp->t_vnet); INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, ("%s: tcpcb has to be stopped here", __func__)); KASSERT((tp->t_timers->tt_flags & timer_type) != 0, ("%s: discard callout should be running", __func__)); tp->t_timers->tt_flags &= ~timer_type; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on this tcpcb, let's free it. */ if ((tp->t_fb->tfb_tcp_timers_left) && (tp->t_fb->tfb_tcp_timers_left(tp))) { /* Some fb timers left running! */ goto leave; } if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } } leave: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif #ifdef TCP_RFC7413 /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); TCPSTATES_DEC(tp->t_state); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); } INP_WUNLOCK(inpb); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_rt) { RTFREE(inp->inp_route.ro_rt); inp->inp_route.ro_rt = (struct rtentry *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = V_tcbinfo.ipi_count + counter_u64_fetch(VNET(tcps_states)[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); m = counter_u64_fetch(VNET(tcps_states)[TCPS_SYN_RECEIVED]); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return (ENOMEM); INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) { in_pcbref(inp); inp_list[i++] = inp; } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_flags & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else { bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (xt.xt_tp.t_timers) tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); } if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } INP_INFO_RLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; else if (PRC_IS_REDIRECT(cmd)) { /* signal EHOSTDOWN, as it flushes the cached route */ in_pcbnotifyall(&V_tcbinfo, faddr, EHOSTDOWN, notify); return; } /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip == NULL) { in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); return; } icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = ntohl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. */ if (!mtu) mtu = ip_next_mtu( ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_WUNLOCK(inp); } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, th); } INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET */ #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); bzero(&inc, sizeof(inc)); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_flags |= INC_ISIPV6; INP_INFO_RLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); INP_INFO_RUNLOCK(&V_tcbinfo); } else in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct tcpcb *tp) { MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { tcp_mtudisc(inp, -1); return (inp); } static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); tp->t_fb->tfb_tcp_output(tp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ u_long tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop4_extended nh4; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, NHR_REF, 0, &nh4) != 0) return (0); ifp = nh4.nh_ifp; maxmtu = nh4.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib4_free_nh_ext(inc->inc_fibnum, &nh4); } return (maxmtu); } #endif /* INET */ #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop6_extended nh6; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, 0, &nh6) != 0) return (0); ifp = nh6.nh_ifp; maxmtu = nh6.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib6_free_nh_ext(inc->inc_fibnum, &nh6); } return (maxmtu); } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PAD(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) || (!key_havesp(IPSEC_DIR_OUTBOUND))) return (0); m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 */ struct secasvar * tcp_get_sav(struct mbuf *m, u_int direction) { union sockaddr_union dst; struct secasvar *sav; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; char ip6buf[INET6_ADDRSTRLEN]; #endif /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { #ifdef INET case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; #endif #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); dst.sa.sa_len = sizeof(struct sockaddr_in6); dst.sa.sa_family = AF_INET6; dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? ip6->ip6_src : ip6->ip6_dst; break; #endif default: return (NULL); /* NOTREACHED */ break; } /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : #ifdef INET6 (ip->ip_v == (IPV6_VERSION >> 4)) ? ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); } return (sav); } /* * Compute TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * sav pointer to security assosiation * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * Releases reference to SADB key before return. * * Return 0 if successful, otherwise return -1. * */ int tcp_signature_do_compute(struct mbuf *m, int len, int optlen, u_char *buf, struct secasvar *sav) { #ifdef INET struct ippseudo ippseudo; #endif MD5_CTX ctx; int doff; struct ip *ip; #ifdef INET struct ipovly *ipovly; #endif struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; uint32_t plen; uint16_t nhdr; #endif u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. */ switch (ip->ip_v) { #ifdef INET case IPVERSION: ipovly = (struct ipovly *)ip; ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; break; #endif #ifdef INET6 /* * RFC 2385, 2.0 Proposal * For IPv6, the pseudo-header is as described in RFC 2460, namely the * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- * extended next header value (to form 32 bits), and 32-bit segment * length. * Note: Upper-Layer Packet Length comes before Next Header. */ case (IPV6_VERSION >> 4): in6 = ip6->ip6_src; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); in6 = ip6->ip6_dst; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); plen = htonl(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); nhdr = 0; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); nhdr = IPPROTO_TCP; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; break; #endif default: KEY_FREESAV(&sav); return (-1); /* NOTREACHED */ break; } /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } /* * Compute TCP-MD5 hash of a TCP segment. (RFC2385) * * Return 0 if successful, otherwise return -1. */ int tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, u_char *buf, u_int direction) { struct secasvar *sav; if ((sav = tcp_get_sav(m, direction)) == NULL) return (-1); return (tcp_signature_do_compute(m, len, optlen, buf, sav)); } /* * Verify the TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * Return 1 if successful, otherwise return 0. */ int tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) { char tmpdigest[TCP_SIGLEN]; if (tcp_sig_checksigs == 0) return (1); if ((tcpbflag & TF_SIGNATURE) == 0) { if ((to->to_flags & TOF_SIGNATURE) != 0) { /* * If this socket is not expecting signature but * the segment contains signature just fail. */ TCPSTAT_INC(tcps_sig_err_sigopt); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } /* Signature is not expected, and not present in segment. */ return (1); } /* * If this socket is expecting signature but the segment does not * contain any just fail. */ if ((to->to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0], IPSEC_DIR_INBOUND) == -1) { TCPSTAT_INC(tcps_sig_err_buildsig); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) { TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } TCPSTAT_INC(tcps_sig_rcvgoodsig); return (1); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } INP_INFO_RLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } Index: head/sys/rpc/rpc_generic.c =================================================================== --- head/sys/rpc/rpc_generic.c (revision 297390) +++ head/sys/rpc/rpc_generic.c (revision 297391) @@ -1,886 +1,878 @@ /* $NetBSD: rpc_generic.c,v 1.4 2000/09/28 09:07:04 kleink Exp $ */ /*- * Copyright (c) 2009, Sun Microsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Sun Microsystems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1986-1991 by Sun Microsystems Inc. */ /* #pragma ident "@(#)rpc_generic.c 1.17 94/04/24 SMI" */ #include __FBSDID("$FreeBSD$"); /* * rpc_generic.c, Miscl routines for RPC. * */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern u_long sb_max_adj; /* not defined in socketvar.h */ #if __FreeBSD_version < 700000 #define strrchr rindex #endif /* Provide an entry point hook for the rpcsec_gss module. */ struct rpc_gss_entries rpc_gss_entries; struct handle { NCONF_HANDLE *nhandle; int nflag; /* Whether NETPATH or NETCONFIG */ int nettype; }; static const struct _rpcnettype { const char *name; const int type; } _rpctypelist[] = { { "netpath", _RPC_NETPATH }, { "visible", _RPC_VISIBLE }, { "circuit_v", _RPC_CIRCUIT_V }, { "datagram_v", _RPC_DATAGRAM_V }, { "circuit_n", _RPC_CIRCUIT_N }, { "datagram_n", _RPC_DATAGRAM_N }, { "tcp", _RPC_TCP }, { "udp", _RPC_UDP }, { 0, _RPC_NONE } }; struct netid_af { const char *netid; int af; int protocol; }; static const struct netid_af na_cvt[] = { { "udp", AF_INET, IPPROTO_UDP }, { "tcp", AF_INET, IPPROTO_TCP }, #ifdef INET6 { "udp6", AF_INET6, IPPROTO_UDP }, { "tcp6", AF_INET6, IPPROTO_TCP }, #endif { "local", AF_LOCAL, 0 } }; struct rpc_createerr rpc_createerr; /* * Find the appropriate buffer size */ u_int /*ARGSUSED*/ __rpc_get_t_size(int af, int proto, int size) { int defsize; switch (proto) { case IPPROTO_TCP: defsize = 64 * 1024; /* XXX */ break; case IPPROTO_UDP: defsize = UDPMSGSIZE; break; default: defsize = RPC_MAXDATASIZE; break; } if (size == 0) return defsize; /* Check whether the value is within the upper max limit */ return (size > sb_max_adj ? (u_int)sb_max_adj : (u_int)size); } /* * Find the appropriate address buffer size */ u_int __rpc_get_a_size(af) int af; { switch (af) { case AF_INET: return sizeof (struct sockaddr_in); #ifdef INET6 case AF_INET6: return sizeof (struct sockaddr_in6); #endif case AF_LOCAL: return sizeof (struct sockaddr_un); default: break; } return ((u_int)RPC_MAXADDRSIZE); } #if 0 /* * Used to ping the NULL procedure for clnt handle. * Returns NULL if fails, else a non-NULL pointer. */ void * rpc_nullproc(clnt) CLIENT *clnt; { struct timeval TIMEOUT = {25, 0}; if (clnt_call(clnt, NULLPROC, (xdrproc_t) xdr_void, NULL, (xdrproc_t) xdr_void, NULL, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return ((void *) clnt); } #endif int __rpc_socket2sockinfo(struct socket *so, struct __rpc_sockinfo *sip) { int type, proto; struct sockaddr *sa; sa_family_t family; struct sockopt opt; int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) return 0; sip->si_alen = sa->sa_len; family = sa->sa_family; free(sa, M_SONAME); opt.sopt_dir = SOPT_GET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_TYPE; opt.sopt_val = &type; opt.sopt_valsize = sizeof type; opt.sopt_td = NULL; error = sogetopt(so, &opt); if (error) return 0; /* XXX */ if (family != AF_LOCAL) { if (type == SOCK_STREAM) proto = IPPROTO_TCP; else if (type == SOCK_DGRAM) proto = IPPROTO_UDP; else return 0; } else proto = 0; sip->si_af = family; sip->si_proto = proto; sip->si_socktype = type; return 1; } /* * Linear search, but the number of entries is small. */ int __rpc_nconf2sockinfo(const struct netconfig *nconf, struct __rpc_sockinfo *sip) { int i; for (i = 0; i < (sizeof na_cvt) / (sizeof (struct netid_af)); i++) if (strcmp(na_cvt[i].netid, nconf->nc_netid) == 0 || ( strcmp(nconf->nc_netid, "unix") == 0 && strcmp(na_cvt[i].netid, "local") == 0)) { sip->si_af = na_cvt[i].af; sip->si_proto = na_cvt[i].protocol; sip->si_socktype = __rpc_seman2socktype((int)nconf->nc_semantics); if (sip->si_socktype == -1) return 0; sip->si_alen = __rpc_get_a_size(sip->si_af); return 1; } return 0; } struct socket * __rpc_nconf2socket(const struct netconfig *nconf) { struct __rpc_sockinfo si; struct socket *so; int error; if (!__rpc_nconf2sockinfo(nconf, &si)) return 0; so = NULL; error = socreate(si.si_af, &so, si.si_socktype, si.si_proto, curthread->td_ucred, curthread); if (error) return NULL; else return so; } char * taddr2uaddr(const struct netconfig *nconf, const struct netbuf *nbuf) { struct __rpc_sockinfo si; if (!__rpc_nconf2sockinfo(nconf, &si)) return NULL; return __rpc_taddr2uaddr_af(si.si_af, nbuf); } struct netbuf * uaddr2taddr(const struct netconfig *nconf, const char *uaddr) { struct __rpc_sockinfo si; if (!__rpc_nconf2sockinfo(nconf, &si)) return NULL; return __rpc_uaddr2taddr_af(si.si_af, uaddr); } char * __rpc_taddr2uaddr_af(int af, const struct netbuf *nbuf) { char *ret; struct sbuf sb; struct sockaddr_in *sin; struct sockaddr_un *sun; char namebuf[INET_ADDRSTRLEN]; #ifdef INET6 struct sockaddr_in6 *sin6; char namebuf6[INET6_ADDRSTRLEN]; #endif u_int16_t port; sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND); switch (af) { case AF_INET: sin = nbuf->buf; if (inet_ntop(af, &sin->sin_addr, namebuf, sizeof namebuf) == NULL) return NULL; port = ntohs(sin->sin_port); if (sbuf_printf(&sb, "%s.%u.%u", namebuf, ((uint32_t)port) >> 8, port & 0xff) < 0) return NULL; break; #ifdef INET6 case AF_INET6: sin6 = nbuf->buf; if (inet_ntop(af, &sin6->sin6_addr, namebuf6, sizeof namebuf6) == NULL) return NULL; port = ntohs(sin6->sin6_port); if (sbuf_printf(&sb, "%s.%u.%u", namebuf6, ((uint32_t)port) >> 8, port & 0xff) < 0) return NULL; break; #endif case AF_LOCAL: sun = nbuf->buf; if (sbuf_printf(&sb, "%.*s", (int)(sun->sun_len - offsetof(struct sockaddr_un, sun_path)), sun->sun_path) < 0) return (NULL); break; default: return NULL; } sbuf_finish(&sb); ret = strdup(sbuf_data(&sb), M_RPC); sbuf_delete(&sb); return ret; } struct netbuf * __rpc_uaddr2taddr_af(int af, const char *uaddr) { struct netbuf *ret = NULL; char *addrstr, *p; unsigned port, portlo, porthi; struct sockaddr_in *sin; #ifdef INET6 struct sockaddr_in6 *sin6; #endif struct sockaddr_un *sun; port = 0; sin = NULL; addrstr = strdup(uaddr, M_RPC); if (addrstr == NULL) return NULL; /* * AF_LOCAL addresses are expected to be absolute * pathnames, anything else will be AF_INET or AF_INET6. */ if (*addrstr != '/') { p = strrchr(addrstr, '.'); if (p == NULL) goto out; portlo = (unsigned)strtol(p + 1, NULL, 10); *p = '\0'; p = strrchr(addrstr, '.'); if (p == NULL) goto out; porthi = (unsigned)strtol(p + 1, NULL, 10); *p = '\0'; port = (porthi << 8) | portlo; } ret = (struct netbuf *)malloc(sizeof *ret, M_RPC, M_WAITOK); - if (ret == NULL) - goto out; switch (af) { case AF_INET: sin = (struct sockaddr_in *)malloc(sizeof *sin, M_RPC, M_WAITOK); - if (sin == NULL) - goto out; memset(sin, 0, sizeof *sin); sin->sin_family = AF_INET; sin->sin_port = htons(port); if (inet_pton(AF_INET, addrstr, &sin->sin_addr) <= 0) { free(sin, M_RPC); free(ret, M_RPC); ret = NULL; goto out; } sin->sin_len = ret->maxlen = ret->len = sizeof *sin; ret->buf = sin; break; #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)malloc(sizeof *sin6, M_RPC, M_WAITOK); - if (sin6 == NULL) - goto out; memset(sin6, 0, sizeof *sin6); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(port); if (inet_pton(AF_INET6, addrstr, &sin6->sin6_addr) <= 0) { free(sin6, M_RPC); free(ret, M_RPC); ret = NULL; goto out; } sin6->sin6_len = ret->maxlen = ret->len = sizeof *sin6; ret->buf = sin6; break; #endif case AF_LOCAL: sun = (struct sockaddr_un *)malloc(sizeof *sun, M_RPC, M_WAITOK); - if (sun == NULL) - goto out; memset(sun, 0, sizeof *sun); sun->sun_family = AF_LOCAL; strncpy(sun->sun_path, addrstr, sizeof(sun->sun_path) - 1); ret->len = ret->maxlen = sun->sun_len = SUN_LEN(sun); ret->buf = sun; break; default: break; } out: free(addrstr, M_RPC); return ret; } int __rpc_seman2socktype(int semantics) { switch (semantics) { case NC_TPI_CLTS: return SOCK_DGRAM; case NC_TPI_COTS_ORD: return SOCK_STREAM; case NC_TPI_RAW: return SOCK_RAW; default: break; } return -1; } int __rpc_socktype2seman(int socktype) { switch (socktype) { case SOCK_DGRAM: return NC_TPI_CLTS; case SOCK_STREAM: return NC_TPI_COTS_ORD; case SOCK_RAW: return NC_TPI_RAW; default: break; } return -1; } /* * Returns the type of the network as defined in * If nettype is NULL, it defaults to NETPATH. */ static int getnettype(const char *nettype) { int i; if ((nettype == NULL) || (nettype[0] == 0)) { return (_RPC_NETPATH); /* Default */ } #if 0 nettype = strlocase(nettype); #endif for (i = 0; _rpctypelist[i].name; i++) if (strcasecmp(nettype, _rpctypelist[i].name) == 0) { return (_rpctypelist[i].type); } return (_rpctypelist[i].type); } /* * For the given nettype (tcp or udp only), return the first structure found. * This should be freed by calling freenetconfigent() */ struct netconfig * __rpc_getconfip(const char *nettype) { char *netid; static char *netid_tcp = (char *) NULL; static char *netid_udp = (char *) NULL; struct netconfig *dummy; if (!netid_udp && !netid_tcp) { struct netconfig *nconf; void *confighandle; if (!(confighandle = setnetconfig())) { log(LOG_ERR, "rpc: failed to open " NETCONFIG); return (NULL); } while ((nconf = getnetconfig(confighandle)) != NULL) { if (strcmp(nconf->nc_protofmly, NC_INET) == 0) { if (strcmp(nconf->nc_proto, NC_TCP) == 0) { netid_tcp = strdup(nconf->nc_netid, M_RPC); } else if (strcmp(nconf->nc_proto, NC_UDP) == 0) { netid_udp = strdup(nconf->nc_netid, M_RPC); } } } endnetconfig(confighandle); } if (strcmp(nettype, "udp") == 0) netid = netid_udp; else if (strcmp(nettype, "tcp") == 0) netid = netid_tcp; else { return (NULL); } if ((netid == NULL) || (netid[0] == 0)) { return (NULL); } dummy = getnetconfigent(netid); return (dummy); } /* * Returns the type of the nettype, which should then be used with * __rpc_getconf(). * * For simplicity in the kernel, we don't support the NETPATH * environment variable. We behave as userland would then NETPATH is * unset, i.e. iterate over all visible entries in netconfig. */ void * __rpc_setconf(nettype) const char *nettype; { struct handle *handle; handle = (struct handle *) malloc(sizeof (struct handle), M_RPC, M_WAITOK); switch (handle->nettype = getnettype(nettype)) { case _RPC_NETPATH: case _RPC_CIRCUIT_N: case _RPC_DATAGRAM_N: if (!(handle->nhandle = setnetconfig())) goto failed; handle->nflag = TRUE; break; case _RPC_VISIBLE: case _RPC_CIRCUIT_V: case _RPC_DATAGRAM_V: case _RPC_TCP: case _RPC_UDP: if (!(handle->nhandle = setnetconfig())) { log(LOG_ERR, "rpc: failed to open " NETCONFIG); goto failed; } handle->nflag = FALSE; break; default: goto failed; } return (handle); failed: free(handle, M_RPC); return (NULL); } /* * Returns the next netconfig struct for the given "net" type. * __rpc_setconf() should have been called previously. */ struct netconfig * __rpc_getconf(void *vhandle) { struct handle *handle; struct netconfig *nconf; handle = (struct handle *)vhandle; if (handle == NULL) { return (NULL); } for (;;) { if (handle->nflag) { nconf = getnetconfig(handle->nhandle); if (nconf && !(nconf->nc_flag & NC_VISIBLE)) continue; } else { nconf = getnetconfig(handle->nhandle); } if (nconf == NULL) break; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) continue; switch (handle->nettype) { case _RPC_VISIBLE: if (!(nconf->nc_flag & NC_VISIBLE)) continue; /* FALLTHROUGH */ case _RPC_NETPATH: /* Be happy */ break; case _RPC_CIRCUIT_V: if (!(nconf->nc_flag & NC_VISIBLE)) continue; /* FALLTHROUGH */ case _RPC_CIRCUIT_N: if ((nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) continue; break; case _RPC_DATAGRAM_V: if (!(nconf->nc_flag & NC_VISIBLE)) continue; /* FALLTHROUGH */ case _RPC_DATAGRAM_N: if (nconf->nc_semantics != NC_TPI_CLTS) continue; break; case _RPC_TCP: if (((nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) || (strcmp(nconf->nc_protofmly, NC_INET) #ifdef INET6 && strcmp(nconf->nc_protofmly, NC_INET6)) #else ) #endif || strcmp(nconf->nc_proto, NC_TCP)) continue; break; case _RPC_UDP: if ((nconf->nc_semantics != NC_TPI_CLTS) || (strcmp(nconf->nc_protofmly, NC_INET) #ifdef INET6 && strcmp(nconf->nc_protofmly, NC_INET6)) #else ) #endif || strcmp(nconf->nc_proto, NC_UDP)) continue; break; } break; } return (nconf); } void __rpc_endconf(vhandle) void * vhandle; { struct handle *handle; handle = (struct handle *) vhandle; if (handle == NULL) { return; } endnetconfig(handle->nhandle); free(handle, M_RPC); } int __rpc_sockisbound(struct socket *so) { struct sockaddr *sa; int error, bound; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) return (0); switch (sa->sa_family) { case AF_INET: bound = (((struct sockaddr_in *) sa)->sin_port != 0); break; #ifdef INET6 case AF_INET6: bound = (((struct sockaddr_in6 *) sa)->sin6_port != 0); break; #endif case AF_LOCAL: /* XXX check this */ bound = (((struct sockaddr_un *) sa)->sun_path[0] != '\0'); break; default: bound = FALSE; break; } free(sa, M_SONAME); return bound; } /* * Implement XDR-style API for RPC call. */ enum clnt_stat clnt_call_private( CLIENT *cl, /* client handle */ struct rpc_callextra *ext, /* call metadata */ rpcproc_t proc, /* procedure number */ xdrproc_t xargs, /* xdr routine for args */ void *argsp, /* pointer to args */ xdrproc_t xresults, /* xdr routine for results */ void *resultsp, /* pointer to results */ struct timeval utimeout) /* seconds to wait before giving up */ { XDR xdrs; struct mbuf *mreq; struct mbuf *mrep; enum clnt_stat stat; mreq = m_getcl(M_WAITOK, MT_DATA, 0); xdrmbuf_create(&xdrs, mreq, XDR_ENCODE); if (!xargs(&xdrs, argsp)) { m_freem(mreq); return (RPC_CANTENCODEARGS); } XDR_DESTROY(&xdrs); stat = CLNT_CALL_MBUF(cl, ext, proc, mreq, &mrep, utimeout); m_freem(mreq); if (stat == RPC_SUCCESS) { xdrmbuf_create(&xdrs, mrep, XDR_DECODE); if (!xresults(&xdrs, resultsp)) { XDR_DESTROY(&xdrs); return (RPC_CANTDECODERES); } XDR_DESTROY(&xdrs); } return (stat); } /* * Bind a socket to a privileged IP port */ int bindresvport(struct socket *so, struct sockaddr *sa) { int old, error, af; bool_t freesa = FALSE; struct sockaddr_in *sin; #ifdef INET6 struct sockaddr_in6 *sin6; #endif struct sockopt opt; int proto, portrange, portlow; u_int16_t *portp; socklen_t salen; if (sa == NULL) { CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) return (error); freesa = TRUE; af = sa->sa_family; salen = sa->sa_len; memset(sa, 0, sa->sa_len); } else { af = sa->sa_family; salen = sa->sa_len; } switch (af) { case AF_INET: proto = IPPROTO_IP; portrange = IP_PORTRANGE; portlow = IP_PORTRANGE_LOW; sin = (struct sockaddr_in *)sa; portp = &sin->sin_port; break; #ifdef INET6 case AF_INET6: proto = IPPROTO_IPV6; portrange = IPV6_PORTRANGE; portlow = IPV6_PORTRANGE_LOW; sin6 = (struct sockaddr_in6 *)sa; portp = &sin6->sin6_port; break; #endif default: return (EPFNOSUPPORT); } sa->sa_family = af; sa->sa_len = salen; if (*portp == 0) { bzero(&opt, sizeof(opt)); opt.sopt_dir = SOPT_GET; opt.sopt_level = proto; opt.sopt_name = portrange; opt.sopt_val = &old; opt.sopt_valsize = sizeof(old); error = sogetopt(so, &opt); if (error) { goto out; } opt.sopt_dir = SOPT_SET; opt.sopt_val = &portlow; error = sosetopt(so, &opt); if (error) goto out; } error = sobind(so, sa, curthread); if (*portp == 0) { if (error) { opt.sopt_dir = SOPT_SET; opt.sopt_val = &old; sosetopt(so, &opt); } } out: if (freesa) free(sa, M_SONAME); return (error); } /* * Kernel module glue */ static int krpc_modevent(module_t mod, int type, void *data) { return (0); } static moduledata_t krpc_mod = { "krpc", krpc_modevent, NULL, }; DECLARE_MODULE(krpc, krpc_mod, SI_SUB_VFS, SI_ORDER_ANY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(krpc, 1); Index: head/sys/ufs/ufs/ufs_extattr.c =================================================================== --- head/sys/ufs/ufs/ufs_extattr.c (revision 297390) +++ head/sys/ufs/ufs/ufs_extattr.c (revision 297391) @@ -1,1300 +1,1298 @@ /*- * Copyright (c) 1999-2002 Robert N. M. Watson * Copyright (c) 2002-2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Support for filesystem extended attribute: UFS-specific support functions. */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_EXTATTR static MALLOC_DEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute"); static int ufs_extattr_sync = 0; SYSCTL_INT(_debug, OID_AUTO, ufs_extattr_sync, CTLFLAG_RW, &ufs_extattr_sync, 0, ""); static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname); static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td); static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td); static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td); static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td); #ifdef UFS_EXTATTR_AUTOSTART static int ufs_extattr_autostart_locked(struct mount *mp, struct thread *td); #endif static int ufs_extattr_start_locked(struct ufsmount *ump, struct thread *td); /* * Per-FS attribute lock protecting attribute operations. * * XXXRW: Perhaps something more fine-grained would be appropriate, but at * the end of the day we're going to contend on the vnode lock for the * backing file anyway. */ static void ufs_extattr_uepm_lock(struct ufsmount *ump) { sx_xlock(&ump->um_extattr.uepm_lock); } static void ufs_extattr_uepm_unlock(struct ufsmount *ump) { sx_xunlock(&ump->um_extattr.uepm_lock); } /*- * Determine whether the name passed is a valid name for an actual * attribute. * * Invalid currently consists of: * NULL pointer for attrname * zero-length attrname (used to retrieve application attribute list) */ static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) { if (attrname == NULL) return (0); if (strlen(attrname) == 0) return (0); return (1); } /* * Locate an attribute given a name and mountpoint. * Must be holding uepm lock for the mount point. */ static struct ufs_extattr_list_entry * ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, const char *attrname) { struct ufs_extattr_list_entry *search_attribute; sx_assert(&ump->um_extattr.uepm_lock, SA_XLOCKED); for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); search_attribute != NULL; search_attribute = LIST_NEXT(search_attribute, uele_entries)) { if (!(strncmp(attrname, search_attribute->uele_attrname, UFS_EXTATTR_MAXEXTATTRNAME)) && (attrnamespace == search_attribute->uele_attrnamespace)) { return (search_attribute); } } return (0); } /* * Initialize per-FS structures supporting extended attributes. Do not * start extended attributes yet. */ void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) { uepm->uepm_flags = 0; LIST_INIT(&uepm->uepm_list); sx_init(&uepm->uepm_lock, "ufs_extattr_sx"); uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; } /* * Destroy per-FS structures supporting extended attributes. Assumes * that EAs have already been stopped, and will panic if not. */ void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) { if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) panic("ufs_extattr_uepm_destroy: not initialized"); if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) panic("ufs_extattr_uepm_destroy: called while still started"); /* * It's not clear that either order for the next two lines is * ideal, and it should never be a problem if this is only called * during unmount, and with vfs_busy(). */ uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; sx_destroy(&uepm->uepm_lock); } /* * Start extended attribute support on an FS. */ int ufs_extattr_start(struct mount *mp, struct thread *td) { struct ufsmount *ump; int error = 0; ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump); error = ufs_extattr_start_locked(ump, td); ufs_extattr_uepm_unlock(ump); return (error); } static int ufs_extattr_start_locked(struct ufsmount *ump, struct thread *td) { if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return (EOPNOTSUPP); if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) return (EBUSY); ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; ump->um_extattr.uepm_ucred = crhold(td->td_ucred); return (0); } #ifdef UFS_EXTATTR_AUTOSTART /* * Helper routine: given a locked parent directory and filename, return * the locked vnode of the inode associated with the name. Will not * follow symlinks, may return any type of vnode. Lock on parent will * be released even in the event of a failure. In the event that the * target is the parent (i.e., "."), there will be two references and * one lock, requiring the caller to possibly special-case. */ #define UE_GETDIR_LOCKPARENT 1 #define UE_GETDIR_LOCKPARENT_DONT 2 static int ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, char *dirname, struct vnode **vp, struct thread *td) { struct vop_cachedlookup_args vargs; struct componentname cnp; struct vnode *target_vp; int error; bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; if (lockparent == UE_GETDIR_LOCKPARENT) cnp.cn_flags |= LOCKPARENT; cnp.cn_lkflags = LK_EXCLUSIVE; cnp.cn_thread = td; cnp.cn_cred = td->td_ucred; cnp.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); cnp.cn_nameptr = cnp.cn_pnbuf; error = copystr(dirname, cnp.cn_pnbuf, MAXPATHLEN, (size_t *) &cnp.cn_namelen); if (error) { if (lockparent == UE_GETDIR_LOCKPARENT_DONT) { VOP_UNLOCK(start_dvp, 0); } uma_zfree(namei_zone, cnp.cn_pnbuf); printf("ufs_extattr_lookup: copystr failed\n"); return (error); } cnp.cn_namelen--; /* trim nul termination */ vargs.a_gen.a_desc = NULL; vargs.a_dvp = start_dvp; vargs.a_vpp = &target_vp; vargs.a_cnp = &cnp; error = ufs_lookup(&vargs); uma_zfree(namei_zone, cnp.cn_pnbuf); if (error) { /* * Error condition, may have to release the lock on the parent * if ufs_lookup() didn't. */ if (lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0); /* * Check that ufs_lookup() didn't release the lock when we * didn't want it to. */ if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); return (error); } /* if (target_vp == start_dvp) panic("ufs_extattr_lookup: target_vp == start_dvp"); */ if (target_vp != start_dvp && lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0); if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); /* printf("ufs_extattr_lookup: success\n"); */ *vp = target_vp; return (0); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Enable an EA using the passed filesystem, backing vnode, attribute name, * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp * to be locked when passed in. The vnode will be returned unlocked, * regardless of success/failure of the function. As a result, the caller * will always need to vrele(), but not vput(). */ static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { int error; error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL); if (error) { printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " "with %d\n", error); VOP_UNLOCK(vp, 0); return (error); } VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); vref(vp); VOP_UNLOCK(vp, 0); error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, td); if (error != 0) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (error); } #ifdef UFS_EXTATTR_AUTOSTART /* * Given a locked directory vnode, iterate over the names in the directory * and use ufs_extattr_lookup() to retrieve locked vnodes of potential * attribute files. Then invoke ufs_extattr_enable_with_open() on each * to attempt to start the attribute. Leaves the directory locked on * exit. */ static int ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, int attrnamespace, struct thread *td) { struct vop_readdir_args vargs; struct dirent *dp, *edp; struct vnode *attr_vp; struct uio auio; struct iovec aiov; char *dirbuf; int error, eofflag = 0; if (dvp->v_type != VDIR) return (ENOTDIR); dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; vargs.a_gen.a_desc = NULL; vargs.a_vp = dvp; vargs.a_uio = &auio; vargs.a_cred = td->td_ucred; vargs.a_eofflag = &eofflag; vargs.a_ncookies = NULL; vargs.a_cookies = NULL; while (!eofflag) { auio.uio_resid = DIRBLKSIZ; aiov.iov_base = dirbuf; aiov.iov_len = DIRBLKSIZ; error = ufs_readdir(&vargs); if (error) { printf("ufs_extattr_iterate_directory: ufs_readdir " "%d\n", error); return (error); } edp = (struct dirent *)&dirbuf[DIRBLKSIZ - auio.uio_resid]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { if (dp->d_reclen == 0) break; error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, dp->d_name, &attr_vp, td); if (error) { printf("ufs_extattr_iterate_directory: lookup " "%s %d\n", dp->d_name, error); } else if (attr_vp == dvp) { vrele(attr_vp); } else if (attr_vp->v_type != VREG) { vput(attr_vp); } else { error = ufs_extattr_enable_with_open(ump, attr_vp, attrnamespace, dp->d_name, td); vrele(attr_vp); if (error) { printf("ufs_extattr_iterate_directory: " "enable %s %d\n", dp->d_name, error); } else if (bootverbose) { printf("UFS autostarted EA %s\n", dp->d_name); } } dp = (struct dirent *) ((char *)dp + dp->d_reclen); if (dp >= edp) break; } } free(dirbuf, M_TEMP); return (0); } /* * Auto-start of extended attributes, to be executed (optionally) at * mount-time. */ int ufs_extattr_autostart(struct mount *mp, struct thread *td) { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump); error = ufs_extattr_autostart_locked(mp, td); ufs_extattr_uepm_unlock(ump); return (error); } static int ufs_extattr_autostart_locked(struct mount *mp, struct thread *td) { struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; struct ufsmount *ump = VFSTOUFS(mp); int error; /* * UFS_EXTATTR applies only to UFS1, as UFS2 uses native extended * attributes, so don't autostart. */ if (ump->um_fstype != UFS1) return (0); /* * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? * If so, automatically start EA's. */ error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp); if (error) { printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", error); return (error); } error = ufs_extattr_lookup(rvp, UE_GETDIR_LOCKPARENT_DONT, UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, td); if (error) { /* rvp ref'd but now unlocked */ vrele(rvp); return (error); } if (rvp == attr_dvp) { /* Should never happen. */ vput(rvp); vrele(attr_dvp); return (EINVAL); } vrele(rvp); if (attr_dvp->v_type != VDIR) { printf("ufs_extattr_autostart: %s != VDIR\n", UFS_EXTATTR_FSROOTSUBDIR); goto return_vput_attr_dvp; } error = ufs_extattr_start_locked(ump, td); if (error) { printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", error); goto return_vput_attr_dvp; } /* * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, * and start with appropriate type. Failures in either don't * result in an over-all failure. attr_dvp is left locked to * be cleaned up on exit. */ error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_system_dvp); } error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_user_dvp, EXTATTR_NAMESPACE_USER, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_user_dvp); } /* Mask startup failures in sub-directories. */ error = 0; return_vput_attr_dvp: vput(attr_dvp); return (error); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Stop extended attribute support on an FS. */ int ufs_extattr_stop(struct mount *mp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct ufsmount *ump = VFSTOUFS(mp); int error = 0; ufs_extattr_uepm_lock(ump); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto unlock; } while ((uele = LIST_FIRST(&ump->um_extattr.uepm_list)) != NULL) { ufs_extattr_disable(ump, uele->uele_attrnamespace, uele->uele_attrname, td); } ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; crfree(ump->um_extattr.uepm_ucred); ump->um_extattr.uepm_ucred = NULL; unlock: ufs_extattr_uepm_unlock(ump); return (error); } /* * Enable a named attribute on the specified filesystem; provide an * unlocked backing vnode to hold the attribute data. */ static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct iovec aiov; struct uio auio; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); if (backing_vnode->v_type != VREG) return (EINVAL); attribute = malloc(sizeof(struct ufs_extattr_list_entry), M_UFS_EXTATTR, M_WAITOK); - if (attribute == NULL) - return (ENOMEM); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto free_exit; } if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { error = EEXIST; goto free_exit; } strncpy(attribute->uele_attrname, attrname, UFS_EXTATTR_MAXEXTATTRNAME); attribute->uele_attrnamespace = attrnamespace; bzero(&attribute->uele_fileheader, sizeof(struct ufs_extattr_fileheader)); attribute->uele_backing_vnode = backing_vnode; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (caddr_t) &attribute->uele_fileheader; aiov.iov_len = sizeof(struct ufs_extattr_fileheader); auio.uio_resid = sizeof(struct ufs_extattr_fileheader); auio.uio_offset = (off_t) 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto unlock_free_exit; if (auio.uio_resid != 0) { printf("ufs_extattr_enable: malformed attribute header\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { printf("ufs_extattr_enable: invalid attribute header magic\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { printf("ufs_extattr_enable: incorrect attribute header " "version\n"); error = EINVAL; goto unlock_free_exit; } ASSERT_VOP_LOCKED(backing_vnode, "ufs_extattr_enable"); LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries); VOP_UNLOCK(backing_vnode, 0); return (0); unlock_free_exit: VOP_UNLOCK(backing_vnode, 0); free_exit: free(attribute, M_UFS_EXTATTR); return (error); } /* * Disable extended attribute support on an FS. */ static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td) { struct ufs_extattr_list_entry *uele; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); if (!uele) return (ENOATTR); LIST_REMOVE(uele, uele_entries); vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(uele->uele_backing_vnode, "ufs_extattr_disable"); VOP_UNLOCK(uele->uele_backing_vnode, 0); error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, td->td_ucred, td); free(uele, M_UFS_EXTATTR); return (error); } /* * VFS call to manage extended attributes in UFS. If filename_vp is * non-NULL, it must be passed in locked, and regardless of errors in * processing, will be unlocked. */ int ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { struct ufsmount *ump = VFSTOUFS(mp); struct thread *td = curthread; int error; /* * Processes with privilege, but in jail, are not allowed to * configure extended attributes. */ error = priv_check(td, PRIV_UFS_EXTATTRCTL); if (error) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0); return (error); } /* * We only allow extattrctl(2) on UFS1 file systems, as UFS2 uses * native extended attributes. */ if (ump->um_fstype != UFS1) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0); return (EOPNOTSUPP); } switch(cmd) { case UFS_EXTATTR_CMD_START: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_start(mp, td); return (error); case UFS_EXTATTR_CMD_STOP: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_stop(mp, td); return (error); case UFS_EXTATTR_CMD_ENABLE: if (filename_vp == NULL) return (EINVAL); if (attrname == NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } /* * ufs_extattr_enable_with_open() will always unlock the * vnode, regardless of failure. */ ufs_extattr_uepm_lock(ump); error = ufs_extattr_enable_with_open(ump, filename_vp, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump); return (error); case UFS_EXTATTR_CMD_DISABLE: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname == NULL) return (EINVAL); ufs_extattr_uepm_lock(ump); error = ufs_extattr_disable(ump, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump); return (error); default: return (EINVAL); } } /* * Vnode operating to retrieve a named extended attribute. */ int ufs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump); error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Real work associated with retrieving a named attribute--assumes that * the attribute lock has already been grabbed. */ static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; size_t len, old_len; int error = 0; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (strlen(name) == 0) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VREAD); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Allow only offsets of zero to encourage the read/replace * extended attribute semantic. Otherwise we can't guarantee * atomicity, as we don't provide locks for extended attributes. */ if (uio != NULL && uio->uio_offset != 0) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Read in the data header to see if the data is defined, and if so * how much. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. * * Don't need to get a lock on the backing file if the getattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number * than the attribute data. For now, the best solution * is to coerce this to undefined, and let it get cleaned * up by the next write or extattrctl clean. */ printf("ufs_extattr_get (%s): inode number inconsistency (%d, %ju)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (uintmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Local size consistency check. */ if (ueh.ueh_len > attribute->uele_fileheader.uef_size) { error = ENXIO; goto vopunlock_exit; } /* Return full data size if caller requested it. */ if (size != NULL) *size = ueh.ueh_len; /* Return data if the caller requested it. */ if (uio != NULL) { /* Allow for offset into the attribute data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); /* * Figure out maximum to transfer -- use buffer size and * local data limit. */ len = MIN(uio->uio_resid, ueh.ueh_len); old_len = uio->uio_resid; uio->uio_resid = len; error = VOP_READ(attribute->uele_backing_vnode, uio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; uio->uio_resid = old_len - (len - uio->uio_resid); } vopunlock_exit: if (uio != NULL) uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Vnode operation to remove a named attribute. */ int ufs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump); error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Vnode operation to set a named attribute. */ int ufs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; /* * XXX: No longer a supported way to delete extended attributes. */ if (ap->a_uio == NULL) return (EINVAL); ufs_extattr_uepm_lock(ump); error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Real work associated with setting a vnode's extended attributes; * assumes that the attribute lock has already been grabbed. */ static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Early rejection of invalid offsets/length. * Reject: any offset but 0 (replace) * Any size greater than attribute size limit */ if (uio->uio_offset != 0 || uio->uio_resid > attribute->uele_fileheader.uef_size) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Write out a data header for the data. */ ueh.ueh_len = uio->uio_resid; ueh.ueh_flags = UFS_EXTATTR_ATTR_FLAG_INUSE; ueh.ueh_i_gen = ip->i_gen; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. * * Don't need to get a lock on the backing file if the setattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) { error = ENXIO; goto vopunlock_exit; } /* * Write out user data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, ump->um_extattr.uepm_ucred); vopunlock_exit: uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Real work associated with removing an extended attribute from a vnode. * Assumes the attribute lock has already been grabbed. */ static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Check to see if currently defined. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Don't need to get the lock on the backing vnode if the vnode we're * modifying is it, as we already hold the lock. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number than * the attribute data. For now, the best solution is to * coerce this to undefined, and let it get cleaned up by * the next write or extattrctl clean. */ printf("ufs_extattr_rm (%s): inode number inconsistency (%d, %jd)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Flag it as not in use. */ ueh.ueh_flags = 0; ueh.ueh_len = 0; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) error = ENXIO; vopunlock_exit: VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Called by UFS when an inode is no longer active and should have its * attributes stripped. */ void ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); /* * In that case, we cannot lock. We should not have any active vnodes * on the fs if this is not yet initialized but is going to be, so * this can go unlocked. */ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return; ufs_extattr_uepm_lock(ump); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { ufs_extattr_uepm_unlock(ump); return; } LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) ufs_extattr_rm(vp, uele->uele_attrnamespace, uele->uele_attrname, NULL, td); ufs_extattr_uepm_unlock(ump); } #endif /* !UFS_EXTATTR */