diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 0df21402683d..5d7802f929ae 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -1,1365 +1,1386 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * Copyright (C) 2015 Mihai Carabas * All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 struct vm_memseg_12 { int segid; size_t len; char name[64]; }; _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI"); #define VM_ALLOC_MEMSEG_12 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12) #define VM_GET_MEMSEG_12 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12) #endif /* COMPAT_FREEBSD12 */ #ifdef COMPAT_FREEBSD14 struct vm_memseg_14 { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; }; _Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16), "COMPAT_FREEBSD14 ABI"); #define VM_ALLOC_MEMSEG_14 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14) #define VM_GET_MEMSEG_14 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14) #endif /* COMPAT_FREEBSD14 */ #endif /* __amd64__ */ struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; struct ucred *ucred; SLIST_ENTRY(vmmdev_softc) link; LIST_ENTRY(vmmdev_softc) priv_link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; struct vmmctl_priv { LIST_HEAD(, vmmdev_softc) softcs; }; static bool vmm_initialized = false; static SLIST_HEAD(, vmmdev_softc) head; static unsigned int pr_allow_vmm_flag, pr_allow_vmm_ppt_flag; static struct sx vmmdev_mtx; SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex"); static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); u_int vm_maxvmms; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxvmms, CTLFLAG_RWTUN, &vm_maxvmms, 0, "Maximum number of VMM instances per user"); static void devmem_destroy(void *arg); static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem); static void vmmdev_destroy(struct vmmdev_softc *sc); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && (ucred->cr_prison->pr_allow & pr_allow_vmm_flag) == 0) return (EPERM); return (0); } static int vcpu_lock_one(struct vcpu *vcpu) { return (vcpu_set_state(vcpu, VCPU_FROZEN, true)); } static void vcpu_unlock_one(struct vcpu *vcpu) { enum vcpu_state state; state = vcpu_get_state(vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state); } vcpu_set_state(vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { int error; /* * Serialize vcpu_lock_all() callers. Individual vCPUs are not locked * in a consistent order so we need to serialize to avoid deadlocks. */ vm_lock_vcpus(sc->vm); error = vcpu_set_state_all(sc->vm, VCPU_FROZEN); if (error != 0) vm_unlock_vcpus(sc->vm); return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; uint16_t i, maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } vm_unlock_vcpus(sc->vm); } static struct vmmdev_softc * vmmdev_lookup(const char *name, struct ucred *cred) { struct vmmdev_softc *sc; sx_assert(&vmmdev_mtx, SA_XLOCKED); SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } if (sc == NULL) return (NULL); if (cr_cansee(cred, sc->ucred)) return (NULL); return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); error = 0; prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vm_unlock_memsegs(sc->vm); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len, struct domainset *domainset) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(sc, mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } #if defined(__amd64__) && \ (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12)) /* * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts. */ static void adjust_segid(struct vm_memseg *mseg) { if (mseg->segid != VM_SYSMEM) { mseg->segid += (VM_BOOTROM - 1); } } #endif static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { int error; /* * A jail without vmm access shouldn't be able to access vmm device * files at all, but check here just to be thorough. */ error = vmm_priv_check(td->td_ucred); if (error != 0) return (error); return (0); } static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STAT_DESC, 0), #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif #ifdef COMPAT_FREEBSD14 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif #endif /* __amd64__ */ VMMDEV_IOCTL(VM_ALLOC_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MMAP_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MUNMAP_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_REINIT, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #ifdef __amd64__ #if defined(COMPAT_FREEBSD12) VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif #ifdef COMPAT_FREEBSD14 VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif #endif /* __amd64__ */ VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), VMMDEV_IOCTL(VM_SUSPEND, 0), VMMDEV_IOCTL(VM_GET_CPUS, 0), VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0), VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0), }; static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct vmmdev_softc *sc; struct vcpu *vcpu; const struct vmmdev_ioctl *ioctl; struct vm_memseg *mseg; int error, vcpuid; sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); ioctl = NULL; for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) { if (vmmdev_ioctls[i].cmd == cmd) { ioctl = &vmmdev_ioctls[i]; break; } } if (ioctl == NULL) { for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) { if (vmmdev_machdep_ioctls[i].cmd == cmd) { ioctl = &vmmdev_machdep_ioctls[i]; break; } } } if (ioctl == NULL) return (ENOTTY); if ((ioctl->flags & VMMDEV_IOCTL_PPT) != 0) { if (jailed(td->td_ucred) && (td->td_ucred->cr_prison->pr_allow & pr_allow_vmm_ppt_flag) == 0) return (EPERM); error = priv_check(td, PRIV_VMM_PPTDEV); if (error != 0) return (error); } if ((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0) vm_xlock_memsegs(sc->vm); else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0) vm_slock_memsegs(sc->vm); vcpu = NULL; vcpuid = -1; if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU | VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) { vcpuid = *(int *)data; if (vcpuid == -1) { if ((ioctl->flags & VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) { error = EINVAL; goto lockfail; } } else { vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto lockfail; } if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) { error = vcpu_lock_one(vcpu); if (error) goto lockfail; } } } if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) { error = vcpu_lock_all(sc); if (error) goto lockfail; } switch (cmd) { case VM_SUSPEND: { struct vm_suspend *vmsuspend; vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; } case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { struct vm_stat_desc *statdesc; statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } case VM_STATS: { struct vm_stats *vmstats; vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(vcpu, vmstats->index, nitems(vmstats->statbuf), &vmstats->num_entries, vmstats->statbuf); break; } case VM_MMAP_GETNEXT: { struct vm_memmap *mm; mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; } case VM_MMAP_MEMSEG: { struct vm_memmap *mm; mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; } case VM_MUNMAP_MEMSEG: { struct vm_munmap *mu; mu = (struct vm_munmap *)data; error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; } #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_12: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = alloc_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name), NULL); break; case VM_GET_MEMSEG_12: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name)); break; #endif /* COMPAT_FREEBSD12 */ #ifdef COMPAT_FREEBSD14 case VM_ALLOC_MEMSEG_14: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = alloc_memseg(sc, mseg, sizeof(((struct vm_memseg_14 *)0)->name), NULL); break; case VM_GET_MEMSEG_14: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_14 *)0)->name)); break; #endif /* COMPAT_FREEBSD14 */ #endif /* __amd64__ */ case VM_ALLOC_MEMSEG: { domainset_t *mask; struct domainset *domainset, domain; domainset = NULL; mseg = (struct vm_memseg *)data; if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) { if (mseg->ds_mask_size < sizeof(domainset_t) || mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) { error = ERANGE; break; } memset(&domain, 0, sizeof(domain)); mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK); error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size); if (error) { free(mask, M_VMMDEV); break; } error = domainset_populate(&domain, mask, mseg->ds_policy, mseg->ds_mask_size); free(mask, M_VMMDEV); if (error) break; domainset = domainset_create(&domain); if (domainset == NULL) { error = EINVAL; break; } } error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset); break; } case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: { struct vm_register *vmreg; vmreg = (struct vm_register *)data; error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); break; } case VM_SET_REGISTER: { struct vm_register *vmreg; vmreg = (struct vm_register *)data; error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); break; } case VM_GET_REGISTER_SET: { struct vm_register_set *vmregset; uint64_t *regvals; int *regnums; vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = mallocarray(vmregset->count, sizeof(regvals[0]), M_VMMDEV, M_WAITOK); regnums = mallocarray(vmregset->count, sizeof(regnums[0]), M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(vcpu, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; } case VM_SET_REGISTER_SET: { struct vm_register_set *vmregset; uint64_t *regvals; int *regnums; vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = mallocarray(vmregset->count, sizeof(regvals[0]), M_VMMDEV, M_WAITOK); regnums = mallocarray(vmregset->count, sizeof(regnums[0]), M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(vcpu, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; } case VM_GET_CAPABILITY: { struct vm_capability *vmcap; vmcap = (struct vm_capability *)data; error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval); break; } case VM_SET_CAPABILITY: { struct vm_capability *vmcap; vmcap = (struct vm_capability *)data; error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval); break; } case VM_ACTIVATE_CPU: error = vm_activate_cpu(vcpu); break; case VM_GET_CPUS: { struct vm_cpuset *vm_cpuset; cpuset_t *cpuset; int size; error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < 1 || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY)) error = ERANGE; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; } case VM_SUSPEND_CPU: error = vm_suspend_cpu(sc->vm, vcpu); break; case VM_RESUME_CPU: error = vm_resume_cpu(sc->vm, vcpu); break; case VM_SET_TOPOLOGY: { struct vm_cpu_topology *topology; topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; } case VM_GET_TOPOLOGY: { struct vm_cpu_topology *topology; topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; } default: error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag, td); break; } if ((ioctl->flags & (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0) vm_unlock_memsegs(sc->vm); if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) vcpu_unlock_all(sc); else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) vcpu_unlock_one(vcpu); /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); lockfail: if ((ioctl->flags & (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0) vm_unlock_memsegs(sc->vm); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; bool sysmem; first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vm_unlock_memsegs(sc->vm); return (error); } static void vmmdev_destroy(struct vmmdev_softc *sc) { struct devmem_softc *dsc; int error __diagused; KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__)); KASSERT(sc->ucred != NULL, ("%s: missing ucred", __func__)); /* * Destroy all cdevs: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); devmem_destroy(dsc); } vm_disable_vcpu_creation(sc->vm); error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); vm_unlock_vcpus(sc->vm); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } vm_destroy(sc->vm); chgvmmcnt(sc->ucred->cr_ruidinfo, -1, 0); crfree(sc->ucred); sx_xlock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); if ((sc->flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0) LIST_REMOVE(sc, priv_link); sx_xunlock(&vmmdev_mtx); wakeup(sc); free(sc, M_VMMDEV); } static int vmmdev_lookup_and_destroy(const char *name, struct ucred *cred) { struct cdev *cdev; struct vmmdev_softc *sc; + int error; sx_xlock(&vmmdev_mtx); sc = vmmdev_lookup(name, cred); if (sc == NULL || sc->cdev == NULL) { sx_xunlock(&vmmdev_mtx); return (EINVAL); } + /* + * Only the creator of a VM or a privileged user can destroy it. + */ + if ((cred->cr_uid != sc->ucred->cr_uid || + cred->cr_prison != sc->ucred->cr_prison) && + (error = priv_check_cred(cred, PRIV_VMM_DESTROY)) != 0) { + sx_xunlock(&vmmdev_mtx); + return (error); + } + /* * Setting 'sc->cdev' to NULL is used to indicate that the VM * is scheduled for destruction. */ cdev = sc->cdev; sc->cdev = NULL; sx_xunlock(&vmmdev_mtx); (void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY); destroy_dev(cdev); vmmdev_destroy(sc); return (0); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred); free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", "Destroy a vmm(4) instance (legacy interface)"); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_open = vmmdev_open, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static struct vmmdev_softc * vmmdev_alloc(struct vm *vm, struct ucred *cred) { struct vmmdev_softc *sc; sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO); SLIST_INIT(&sc->devmem); sc->vm = vm; sc->ucred = crhold(cred); return (sc); } static int vmmdev_create(const char *name, uint32_t flags, struct ucred *cred) { struct make_dev_args mda; struct cdev *cdev; struct vmmdev_softc *sc; struct vmmctl_priv *priv; struct vm *vm; int error; if (name == NULL || strlen(name) > VM_MAX_NAMELEN) return (EINVAL); if ((flags & ~VMMCTL_FLAGS_MASK) != 0) return (EINVAL); error = devfs_get_cdevpriv((void **)&priv); if (error) return (error); sx_xlock(&vmmdev_mtx); sc = vmmdev_lookup(name, cred); if (sc != NULL) { sx_xunlock(&vmmdev_mtx); return (EEXIST); } + /* + * Unprivileged users can only create VMs that will be automatically + * destroyed when the creating descriptor is closed. + */ + if ((flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) == 0 && + (error = priv_check_cred(cred, PRIV_VMM_CREATE)) != 0) { + sx_xunlock(&vmmdev_mtx); + return (error); + } + if (!chgvmmcnt(cred->cr_ruidinfo, 1, vm_maxvmms)) { sx_xunlock(&vmmdev_mtx); return (ENOMEM); } error = vm_create(name, &vm); if (error != 0) { sx_xunlock(&vmmdev_mtx); (void)chgvmmcnt(cred->cr_ruidinfo, -1, 0); return (error); } sc = vmmdev_alloc(vm, cred); SLIST_INSERT_HEAD(&head, sc, link); sc->flags = flags; if ((flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0) LIST_INSERT_HEAD(&priv->softcs, sc, priv_link); make_dev_args_init(&mda); mda.mda_devsw = &vmmdevsw; mda.mda_cr = sc->ucred; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; error = make_dev_s(&mda, &cdev, "vmm/%s", name); if (error != 0) { sx_xunlock(&vmmdev_mtx); vmmdev_destroy(sc); return (error); } sc->cdev = cdev; sx_xunlock(&vmmdev_mtx); return (0); } static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { char *buf; int error, buflen; if (!vmm_initialized) return (ENXIO); error = vmm_priv_check(req->td->td_ucred); if (error != 0) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) error = vmmdev_create(buf, 0, req->td->td_ucred); free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", "Create a vmm(4) instance (legacy interface)"); static void vmmctl_dtor(void *arg) { struct cdev *sc_cdev; struct vmmdev_softc *sc; struct vmmctl_priv *priv = arg; /* * Scan the softc list for any VMs associated with * the current descriptor and destroy them. */ sx_xlock(&vmmdev_mtx); while (!LIST_EMPTY(&priv->softcs)) { sc = LIST_FIRST(&priv->softcs); sc_cdev = sc->cdev; if (sc_cdev != NULL) { sc->cdev = NULL; } else { /* * Another thread has already * started the removal process. * Sleep until 'vmmdev_destroy' notifies us * that the removal has finished. */ sx_sleep(sc, &vmmdev_mtx, 0, "vmmctl_dtor", 0); continue; } /* * Temporarily drop the lock to allow vmmdev_destroy to run. */ sx_xunlock(&vmmdev_mtx); (void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY); destroy_dev(sc_cdev); /* vmmdev_destroy will unlink the 'priv_link' entry. */ vmmdev_destroy(sc); sx_xlock(&vmmdev_mtx); } sx_xunlock(&vmmdev_mtx); free(priv, M_VMMDEV); } static int vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td) { int error; struct vmmctl_priv *priv; error = vmm_priv_check(td->td_ucred); if (error != 0) return (error); if ((flags & FWRITE) == 0) return (EPERM); priv = malloc(sizeof(*priv), M_VMMDEV, M_WAITOK | M_ZERO); LIST_INIT(&priv->softcs); error = devfs_set_cdevpriv(priv, vmmctl_dtor); if (error != 0) { free(priv, M_VMMDEV); return (error); } return (0); } static int vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error; switch (cmd) { case VMMCTL_VM_CREATE: { struct vmmctl_vm_create *vmc; vmc = (struct vmmctl_vm_create *)data; vmc->name[VM_MAX_NAMELEN] = '\0'; for (size_t i = 0; i < nitems(vmc->reserved); i++) { if (vmc->reserved[i] != 0) { error = EINVAL; return (error); } } error = vmmdev_create(vmc->name, vmc->flags, td->td_ucred); break; } case VMMCTL_VM_DESTROY: { struct vmmctl_vm_destroy *vmd; vmd = (struct vmmctl_vm_destroy *)data; vmd->name[VM_MAX_NAMELEN] = '\0'; for (size_t i = 0; i < nitems(vmd->reserved); i++) { if (vmd->reserved[i] != 0) { error = EINVAL; return (error); } } error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred); break; } default: error = ENOTTY; break; } return (error); } static struct cdev *vmmctl_cdev; static struct cdevsw vmmctlsw = { .d_name = "vmmctl", .d_version = D_VERSION, .d_open = vmmctl_open, .d_ioctl = vmmctl_ioctl, }; static int vmmdev_init(void) { int error; sx_xlock(&vmmdev_mtx); error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmmctl"); if (error == 0) { pr_allow_vmm_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail"); pr_allow_vmm_ppt_flag = prison_add_allow(NULL, "vmm_ppt", NULL, "Allow use of vmm with ppt devices in a jail"); } sx_xunlock(&vmmdev_mtx); return (error); } static int vmmdev_cleanup(void) { sx_xlock(&vmmdev_mtx); if (!SLIST_EMPTY(&head)) { sx_xunlock(&vmmdev_mtx); return (EBUSY); } if (vmmctl_cdev != NULL) { destroy_dev(vmmctl_cdev); vmmctl_cdev = NULL; } sx_xunlock(&vmmdev_mtx); return (0); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: error = vmmdev_init(); if (error != 0) break; vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; vm_maxvmms = 4 * mp_ncpus; error = vmm_modinit(); if (error == 0) vmm_initialized = true; else { int error1 __diagused; error1 = vmmdev_cleanup(); KASSERT(error1 == 0, ("%s: vmmdev_cleanup failed: %d", __func__, error1)); } break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0 && vmm_initialized) { error = vmm_modcleanup(); if (error) { /* * Something bad happened - prevent new * VMs from being created */ vmm_initialized = false; } } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - Initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). * - vmm device initialization requires an initialized devfs. */ DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); vm_slock_memsegs(dsc->sc->vm); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); if (seglen >= last) vm_object_reference(*objp); else error = EINVAL; vm_unlock_memsegs(dsc->sc->vm); return (error); } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname) { struct make_dev_args mda; struct devmem_softc *dsc; int error; sx_xlock(&vmmdev_mtx); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); dsc->segid = segid; dsc->name = devname; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); make_dev_args_init(&mda); mda.mda_devsw = &devmemsw; mda.mda_cr = sc->ucred; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = dsc; mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm), devname); if (error != 0) { SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } sx_xunlock(&vmmdev_mtx); return (error); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; destroy_dev(dsc->cdev); dsc->cdev = NULL; dsc->sc = NULL; } diff --git a/sys/sys/priv.h b/sys/sys/priv.h index 1ad6a4882ffc..e302d9487e64 100644 --- a/sys/sys/priv.h +++ b/sys/sys/priv.h @@ -1,573 +1,575 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006 nCircle Network Security, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Privilege checking interface for BSD kernel. */ #ifndef _SYS_PRIV_H_ #define _SYS_PRIV_H_ /* * Privilege list, sorted loosely by kernel subsystem. * * Think carefully before adding or reusing one of these privileges -- are * there existing instances referring to the same privilege? Third party * vendors may request the assignment of privileges to be used in loadable * modules. Particular numeric privilege assignments are part of the * loadable kernel module ABI, and should not be changed across minor * releases. * * When adding a new privilege, remember to determine if it's appropriate * for use in jail, and update the privilege switch in prison_priv_check() * in kern_jail.c as necessary. */ /* * Track beginning of privilege list. */ #define _PRIV_LOWEST 1 /* * The remaining privileges typically correspond to one or a small * number of specific privilege checks, and have (relatively) precise * meanings. They are loosely sorted into a set of base system * privileges, such as the ability to reboot, and then loosely by * subsystem, indicated by a subsystem name. */ #define _PRIV_ROOT 1 /* Removed. */ #define PRIV_ACCT 2 /* Manage process accounting. */ #define PRIV_MAXFILES 3 /* Exceed system open files limit. */ #define PRIV_MAXPROC 4 /* Exceed system processes limit. */ #define PRIV_KTRACE 5 /* Set/clear KTRFAC_ROOT on ktrace. */ #define PRIV_SETDUMPER 6 /* Configure dump device. */ #define PRIV_REBOOT 8 /* Can reboot system. */ #define PRIV_SWAPON 9 /* Can swapon(). */ #define PRIV_SWAPOFF 10 /* Can swapoff(). */ #define PRIV_MSGBUF 11 /* Can read kernel message buffer. */ #define PRIV_IO 12 /* Can perform low-level I/O. */ #define PRIV_KEYBOARD 13 /* Reprogram keyboard. */ #define PRIV_DRIVER 14 /* Low-level driver privilege. */ #define PRIV_ADJTIME 15 /* Set time adjustment. */ #define PRIV_NTP_ADJTIME 16 /* Set NTP time adjustment. */ #define PRIV_CLOCK_SETTIME 17 /* Can call clock_settime. */ #define PRIV_SETTIMEOFDAY 18 /* Can call settimeofday. */ #define _PRIV_SETHOSTID 19 /* Removed. */ #define _PRIV_SETDOMAINNAME 20 /* Removed. */ /* * Audit subsystem privileges. */ #define PRIV_AUDIT_CONTROL 40 /* Can configure audit. */ #define PRIV_AUDIT_FAILSTOP 41 /* Can run during audit fail stop. */ #define PRIV_AUDIT_GETAUDIT 42 /* Can get proc audit properties. */ #define PRIV_AUDIT_SETAUDIT 43 /* Can set proc audit properties. */ #define PRIV_AUDIT_SUBMIT 44 /* Can submit an audit record. */ /* * Credential management privileges. */ #define PRIV_CRED_SETUID 50 /* setuid. */ #define PRIV_CRED_SETEUID 51 /* seteuid to !ruid and !svuid. */ #define PRIV_CRED_SETGID 52 /* setgid. */ #define PRIV_CRED_SETEGID 53 /* setgid to !rgid and !svgid. */ #define PRIV_CRED_SETGROUPS 54 /* Set process additional groups. */ #define PRIV_CRED_SETREUID 55 /* setreuid. */ #define PRIV_CRED_SETREGID 56 /* setregid. */ #define PRIV_CRED_SETRESUID 57 /* setresuid. */ #define PRIV_CRED_SETRESGID 58 /* setresgid. */ #define PRIV_SEEOTHERGIDS 59 /* Exempt bsd.seeothergids. */ #define PRIV_SEEOTHERUIDS 60 /* Exempt bsd.seeotheruids. */ #define PRIV_SEEJAILPROC 61 /* Exempt from bsd.see_jail_proc. */ #define PRIV_CRED_SETCRED 62 /* setcred. */ /* * Debugging privileges. */ #define PRIV_DEBUG_DIFFCRED 80 /* Exempt debugging other users. */ #define PRIV_DEBUG_SUGID 81 /* Exempt debugging setuid proc. */ #define PRIV_DEBUG_UNPRIV 82 /* Exempt unprivileged debug limit. */ #define PRIV_DEBUG_DENIED 83 /* Exempt P2_NOTRACE. */ #define PRIV_DEBUG_DIFFJAIL 84 /* Exempt debugging other jails. */ /* * Dtrace privileges. */ #define PRIV_DTRACE_KERNEL 90 /* Allow use of DTrace on the kernel. */ #define PRIV_DTRACE_PROC 91 /* Allow attaching DTrace to process. */ #define PRIV_DTRACE_USER 92 /* Process may submit DTrace events. */ /* * Firmware privilegs. */ #define PRIV_FIRMWARE_LOAD 100 /* Can load firmware. */ /* * Jail privileges. */ #define PRIV_JAIL_ATTACH 110 /* Attach to a jail. */ #define PRIV_JAIL_SET 111 /* Set jail parameters. */ #define PRIV_JAIL_REMOVE 112 /* Remove a jail. */ /* * Kernel environment privileges. */ #define PRIV_KENV_SET 120 /* Set kernel env. variables. */ #define PRIV_KENV_UNSET 121 /* Unset kernel env. variables. */ /* * Loadable kernel module privileges. */ #define PRIV_KLD_LOAD 130 /* Load a kernel module. */ #define PRIV_KLD_UNLOAD 131 /* Unload a kernel module. */ /* * Privileges associated with the MAC Framework and specific MAC policy * modules. */ #define PRIV_MAC_PARTITION 140 /* Privilege in mac_partition policy. */ #define PRIV_MAC_PRIVS 141 /* Privilege in the mac_privs policy. */ /* * Process-related privileges. */ #define PRIV_PROC_LIMIT 160 /* Exceed user process limit. */ #define PRIV_PROC_SETLOGIN 161 /* Can call setlogin. */ #define PRIV_PROC_SETRLIMIT 162 /* Can raise resources limits. */ #define PRIV_PROC_SETLOGINCLASS 163 /* Can call setloginclass(2). */ /* * System V IPC privileges. */ #define PRIV_IPC_READ 170 /* Can override IPC read perm. */ #define PRIV_IPC_WRITE 171 /* Can override IPC write perm. */ #define PRIV_IPC_ADMIN 172 /* Can override IPC owner-only perm. */ #define PRIV_IPC_MSGSIZE 173 /* Exempt IPC message queue limit. */ /* * POSIX message queue privileges. */ #define PRIV_MQ_ADMIN 180 /* Can override msgq owner-only perm. */ /* * Performance monitoring counter privileges. */ #define PRIV_PMC_MANAGE 190 /* Can administer PMC. */ #define PRIV_PMC_SYSTEM 191 /* Can allocate a system-wide PMC. */ /* * Scheduling privileges. */ #define PRIV_SCHED_DIFFCRED 200 /* Exempt scheduling other users. */ #define PRIV_SCHED_SETPRIORITY 201 /* Can set lower nice value for proc. */ #define PRIV_SCHED_RTPRIO 202 /* Can set real time scheduling. */ #define PRIV_SCHED_SETPOLICY 203 /* Can set scheduler policy. */ #define PRIV_SCHED_SET 204 /* Can set thread scheduler. */ #define PRIV_SCHED_SETPARAM 205 /* Can set thread scheduler params. */ #define PRIV_SCHED_CPUSET 206 /* Can manipulate cpusets. */ #define PRIV_SCHED_CPUSET_INTR 207 /* Can adjust IRQ to CPU binding. */ #define PRIV_SCHED_IDPRIO 208 /* Can set idle time scheduling. */ #define PRIV_SCHED_DIFFJAIL 209 /* Exempt scheduling other jails. */ /* * POSIX semaphore privileges. */ #define PRIV_SEM_WRITE 220 /* Can override sem write perm. */ /* * Signal privileges. */ #define PRIV_SIGNAL_DIFFCRED 230 /* Exempt signalling other users. */ #define PRIV_SIGNAL_SUGID 231 /* Non-conserv signal setuid proc. */ #define PRIV_SIGNAL_DIFFJAIL 232 /* Exempt signalling other jails. */ /* * Sysctl privileges. */ #define PRIV_SYSCTL_DEBUG 240 /* Can invoke sysctl.debug. */ #define PRIV_SYSCTL_WRITE 241 /* Can write sysctls. */ #define PRIV_SYSCTL_WRITEJAIL 242 /* Can write sysctls, jail permitted. */ #define PRIV_SYSCTL_MEMLOCK 243 /* Large requests are not serialized. */ /* * TTY privileges. */ #define PRIV_TTY_CONSOLE 250 /* Set console to tty. */ #define PRIV_TTY_DRAINWAIT 251 /* Set tty drain wait time. */ #define PRIV_TTY_DTRWAIT 252 /* Set DTR wait on tty. */ #define PRIV_TTY_EXCLUSIVE 253 /* Override tty exclusive flag. */ #define _PRIV_TTY_PRISON 254 /* Removed. */ #define PRIV_TTY_STI 255 /* Simulate input on another tty. */ #define PRIV_TTY_SETA 256 /* Set tty termios structure. */ /* * UFS-specific privileges. */ #define PRIV_UFS_EXTATTRCTL 270 /* Can configure EAs on UFS1. */ #define PRIV_UFS_QUOTAOFF 271 /* quotaoff(). */ #define PRIV_UFS_QUOTAON 272 /* quotaon(). */ #define PRIV_UFS_SETUSE 273 /* setuse(). */ /* * ZFS-specific privileges. */ #define PRIV_ZFS_POOL_CONFIG 280 /* Can configure ZFS pools. */ #define PRIV_ZFS_INJECT 281 /* Can inject faults in the ZFS fault injection framework. */ #define PRIV_ZFS_JAIL 282 /* Can attach/detach ZFS file systems to/from jails. */ /* * NFS-specific privileges. */ #define PRIV_NFS_DAEMON 290 /* Can become the NFS daemon. */ #define PRIV_NFS_LOCKD 291 /* Can become NFS lock daemon. */ /* * VFS privileges. */ #define PRIV_VFS_READ 310 /* Override vnode DAC read perm. */ #define PRIV_VFS_WRITE 311 /* Override vnode DAC write perm. */ #define PRIV_VFS_ADMIN 312 /* Override vnode DAC admin perm. */ #define PRIV_VFS_EXEC 313 /* Override vnode DAC exec perm. */ #define PRIV_VFS_LOOKUP 314 /* Override vnode DAC lookup perm. */ #define PRIV_VFS_BLOCKRESERVE 315 /* Can use free block reserve. */ #define PRIV_VFS_CHFLAGS_DEV 316 /* Can chflags() a device node. */ #define PRIV_VFS_CHOWN 317 /* Can set user; group to non-member. */ #define PRIV_VFS_CHROOT 318 /* chroot(). */ #define PRIV_VFS_RETAINSUGID 319 /* Can retain sugid bits on change. */ #define PRIV_VFS_EXCEEDQUOTA 320 /* Exempt from quota restrictions. */ #define PRIV_VFS_EXTATTR_SYSTEM 321 /* Operate on system EA namespace. */ #define PRIV_VFS_FCHROOT 322 /* fchroot(). */ #define PRIV_VFS_FHOPEN 323 /* Can fhopen(). */ #define PRIV_VFS_FHSTAT 324 /* Can fhstat(). */ #define PRIV_VFS_FHSTATFS 325 /* Can fhstatfs(). */ #define PRIV_VFS_GENERATION 326 /* stat() returns generation number. */ #define PRIV_VFS_GETFH 327 /* Can retrieve file handles. */ #define PRIV_VFS_GETQUOTA 328 /* getquota(). */ #define PRIV_VFS_LINK 329 /* bsd.hardlink_check_uid */ #define PRIV_VFS_MKNOD_BAD 330 /* Was: mknod() can mark bad inodes. */ #define PRIV_VFS_MKNOD_DEV 331 /* Can mknod() to create dev nodes. */ #define PRIV_VFS_MKNOD_WHT 332 /* Can mknod() to create whiteout. */ #define PRIV_VFS_MOUNT 333 /* Can mount(). */ #define PRIV_VFS_MOUNT_OWNER 334 /* Can manage other users' file systems. */ #define PRIV_VFS_MOUNT_EXPORTED 335 /* Can set MNT_EXPORTED on mount. */ #define PRIV_VFS_MOUNT_PERM 336 /* Override dev node perms at mount. */ #define PRIV_VFS_MOUNT_SUIDDIR 337 /* Can set MNT_SUIDDIR on mount. */ #define PRIV_VFS_MOUNT_NONUSER 338 /* Can perform a non-user mount. */ #define PRIV_VFS_SETGID 339 /* Can setgid if not in group. */ #define PRIV_VFS_SETQUOTA 340 /* setquota(). */ #define PRIV_VFS_STICKYFILE 341 /* Can set sticky bit on file. */ #define PRIV_VFS_SYSFLAGS 342 /* Can modify system flags. */ #define PRIV_VFS_UNMOUNT 343 /* Can unmount(). */ #define PRIV_VFS_STAT 344 /* Override vnode MAC stat perm. */ #define PRIV_VFS_READ_DIR 345 /* Can read(2) a dirfd, needs sysctl. */ /* * Virtual memory privileges. */ #define PRIV_VM_MADV_PROTECT 360 /* Can set MADV_PROTECT. */ #define PRIV_VM_MLOCK 361 /* Can mlock(), mlockall(). */ #define PRIV_VM_MUNLOCK 362 /* Can munlock(), munlockall(). */ #define PRIV_VM_SWAP_NOQUOTA 363 /* * Can override the global * swap reservation limits. */ #define PRIV_VM_SWAP_NORLIMIT 364 /* * Can override the per-uid * swap reservation limits. */ /* * Device file system privileges. */ #define PRIV_DEVFS_RULE 370 /* Can manage devfs rules. */ #define PRIV_DEVFS_SYMLINK 371 /* Can create symlinks in devfs. */ /* * Random number generator privileges. */ #define PRIV_RANDOM_RESEED 380 /* Closing /dev/random reseeds. */ /* * Network stack privileges. */ #define PRIV_NET_BRIDGE 390 /* Administer bridge. */ #define PRIV_NET_GRE 391 /* Administer GRE. */ #define _PRIV_NET_PPP 392 /* Removed. */ #define _PRIV_NET_SLIP 393 /* Removed. */ #define PRIV_NET_BPF 394 /* Monitor BPF. */ #define PRIV_NET_RAW 395 /* Open raw socket. */ #define PRIV_NET_ROUTE 396 /* Administer routing. */ #define PRIV_NET_TAP 397 /* Can open tap device. */ #define PRIV_NET_SETIFMTU 398 /* Set interface MTU. */ #define PRIV_NET_SETIFFLAGS 399 /* Set interface flags. */ #define PRIV_NET_SETIFCAP 400 /* Set interface capabilities. */ #define PRIV_NET_SETIFNAME 401 /* Set interface name. */ #define PRIV_NET_SETIFMETRIC 402 /* Set interface metrics. */ #define PRIV_NET_SETIFPHYS 403 /* Set interface physical layer prop. */ #define PRIV_NET_SETIFMAC 404 /* Set interface MAC label. */ #define PRIV_NET_ADDMULTI 405 /* Add multicast addr. to ifnet. */ #define PRIV_NET_DELMULTI 406 /* Delete multicast addr. from ifnet. */ #define PRIV_NET_HWIOCTL 407 /* Issue hardware ioctl on ifnet. */ #define PRIV_NET_SETLLADDR 408 /* Set interface link-level address. */ #define PRIV_NET_ADDIFGROUP 409 /* Add new interface group. */ #define PRIV_NET_DELIFGROUP 410 /* Delete interface group. */ #define PRIV_NET_IFCREATE 411 /* Create cloned interface. */ #define PRIV_NET_IFDESTROY 412 /* Destroy cloned interface. */ #define PRIV_NET_ADDIFADDR 413 /* Add protocol addr to interface. */ #define PRIV_NET_DELIFADDR 414 /* Delete protocol addr on interface. */ #define PRIV_NET_LAGG 415 /* Administer lagg interface. */ #define PRIV_NET_GIF 416 /* Administer gif interface. */ #define PRIV_NET_SETIFVNET 417 /* Move interface to vnet. */ #define PRIV_NET_SETIFDESCR 418 /* Set interface description. */ #define PRIV_NET_SETIFFIB 419 /* Set interface fib. */ #define PRIV_NET_VXLAN 420 /* Administer vxlan. */ #define PRIV_NET_SETLANPCP 421 /* Set LAN priority. */ #define PRIV_NET_SETVLANPCP PRIV_NET_SETLANPCP /* Alias Set VLAN priority */ #define PRIV_NET_OVPN 422 /* Administer OpenVPN DCO. */ #define PRIV_NET_ME 423 /* Administer ME interface. */ #define PRIV_NET_WG 424 /* Administer WireGuard interface. */ /* * 802.11-related privileges. */ #define PRIV_NET80211_VAP_GETKEY 440 /* Query VAP 802.11 keys. */ #define PRIV_NET80211_VAP_MANAGE 441 /* Administer 802.11 VAP */ #define PRIV_NET80211_VAP_SETMAC 442 /* Set VAP MAC address */ #define PRIV_NET80211_CREATE_VAP 443 /* Create a new VAP */ /* * Placeholder for AppleTalk privileges, not supported anymore. */ #define _PRIV_NETATALK_RESERVEDPORT 450 /* Bind low port number. */ /* * ATM privileges. */ #define PRIV_NETATM_CFG 460 #define PRIV_NETATM_ADD 461 #define PRIV_NETATM_DEL 462 #define PRIV_NETATM_SET 463 /* * Bluetooth privileges. */ #define PRIV_NETBLUETOOTH_RAW 470 /* Open raw bluetooth socket. */ /* * Netgraph and netgraph module privileges. */ #define PRIV_NETGRAPH_CONTROL 480 /* Open netgraph control socket. */ #define PRIV_NETGRAPH_TTY 481 /* Configure tty for netgraph. */ /* * IPv4 and IPv6 privileges. */ #define PRIV_NETINET_RESERVEDPORT 490 /* Bind low port number. */ #define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ #define PRIV_NETINET_DIVERT 492 /* Open IP divert socket. */ #define PRIV_NETINET_PF 493 /* Administer pf firewall. */ #define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ #define PRIV_NETINET_CARP 495 /* Administer CARP. */ #define PRIV_NETINET_MROUTE 496 /* Administer multicast routing. */ #define PRIV_NETINET_RAW 497 /* Open netinet raw socket. */ #define PRIV_NETINET_GETCRED 498 /* Query netinet pcb credentials. */ #define PRIV_NETINET_ADDRCTRL6 499 /* Administer IPv6 address scopes. */ #define PRIV_NETINET_ND6 500 /* Administer IPv6 neighbor disc. */ #define PRIV_NETINET_SCOPE6 501 /* Administer IPv6 address scopes. */ #define PRIV_NETINET_ALIFETIME6 502 /* Administer IPv6 address lifetimes. */ #define PRIV_NETINET_IPSEC 503 /* Administer IPSEC. */ #define PRIV_NETINET_REUSEPORT 504 /* Allow [rapid] port/address reuse. */ #define PRIV_NETINET_SETHDROPTS 505 /* Set certain IPv4/6 header options. */ #define PRIV_NETINET_BINDANY 506 /* Allow bind to any address. */ #define PRIV_NETINET_HASHKEY 507 /* Get and set hash keys for IPv4/6. */ #define PRIV_NETINET_KTLSKEYS 508 /* Read ktls session keys. */ /* * Placeholders for IPX/SPX privileges, not supported any more. */ #define _PRIV_NETIPX_RESERVEDPORT 520 /* Bind low port number. */ #define _PRIV_NETIPX_RAW 521 /* Open netipx raw socket. */ /* * NCP privileges. */ #define PRIV_NETNCP 530 /* Use another user's connection. */ /* * SMB privileges. */ #define PRIV_NETSMB 540 /* Use another user's connection. */ /* * VM86 privileges. */ #define PRIV_VM86_INTCALL 550 /* Allow invoking vm86 int handlers. */ #define PRIV_PIPEBUF 560 /* Allow to allocate reserved pipebuf space */ /* * Set of reserved privilege values, which will be allocated to code as * needed, in order to avoid renumbering later privileges due to insertion. */ #define _PRIV_RESERVED1 561 #define _PRIV_RESERVED2 562 #define _PRIV_RESERVED3 563 #define _PRIV_RESERVED4 564 #define _PRIV_RESERVED5 565 #define _PRIV_RESERVED6 566 #define _PRIV_RESERVED7 567 #define _PRIV_RESERVED8 568 #define _PRIV_RESERVED9 569 #define _PRIV_RESERVED10 570 #define _PRIV_RESERVED11 571 #define _PRIV_RESERVED12 572 #define _PRIV_RESERVED13 573 #define _PRIV_RESERVED14 574 #define _PRIV_RESERVED15 575 /* * Define a set of valid privilege numbers that can be used by loadable * modules that don't yet have privilege reservations. Ideally, these should * not be used, since their meaning is opaque to any policies that are aware * of specific privileges, such as jail, and as such may be arbitrarily * denied. */ #define PRIV_MODULE0 600 #define PRIV_MODULE1 601 #define PRIV_MODULE2 602 #define PRIV_MODULE3 603 #define PRIV_MODULE4 604 #define PRIV_MODULE5 605 #define PRIV_MODULE6 606 #define PRIV_MODULE7 607 #define PRIV_MODULE8 608 #define PRIV_MODULE9 609 #define PRIV_MODULE10 610 #define PRIV_MODULE11 611 #define PRIV_MODULE12 612 #define PRIV_MODULE13 613 #define PRIV_MODULE14 614 #define PRIV_MODULE15 615 /* * DDB(4) privileges. */ #define PRIV_DDB_CAPTURE 620 /* Allow reading of DDB capture log. */ /* * Arla/nnpfs privileges. */ #define PRIV_NNPFS_DEBUG 630 /* Perforn ARLA_VIOC_NNPFSDEBUG. */ /* * cpuctl(4) privileges. */ #define PRIV_CPUCTL_WRMSR 640 /* Write model-specific register. */ #define PRIV_CPUCTL_UPDATE 641 /* Update cpu microcode. */ /* * Capi4BSD privileges. */ #define PRIV_C4B_RESET_CTLR 650 /* Load firmware, reset controller. */ #define PRIV_C4B_TRACE 651 /* Unrestricted CAPI message tracing. */ /* * OpenAFS privileges. */ #define PRIV_AFS_ADMIN 660 /* Can change AFS client settings. */ #define PRIV_AFS_DAEMON 661 /* Can become the AFS daemon. */ /* * Resource Limits privileges. */ #define PRIV_RCTL_GET_RACCT 670 #define PRIV_RCTL_GET_RULES 671 #define PRIV_RCTL_GET_LIMITS 672 #define PRIV_RCTL_ADD_RULE 673 #define PRIV_RCTL_REMOVE_RULE 674 /* * mem(4) privileges. */ #define PRIV_KMEM_READ 680 /* Open mem/kmem for reading. */ #define PRIV_KMEM_WRITE 681 /* Open mem/kmem for writing. */ #define PRIV_PROC_MEM_WRITE 682 /* Writes via proc_rwmem */ /* * Kernel debugger privileges. */ #define PRIV_KDB_SET_BACKEND 690 /* Allow setting KDB backend. */ /* * veriexec override privileges - very rare! */ #define PRIV_VERIEXEC_DIRECT 700 /* Can override 'indirect' */ #define PRIV_VERIEXEC_NOVERIFY 701 /* Can override O_VERIFY */ #define PRIV_VERIEXEC_CONTROL 702 /* Can configure veriexec */ /* * vmm privileges. */ #define PRIV_VMM_PPTDEV 710 /* Can manipulate ppt devices. */ +#define PRIV_VMM_CREATE 711 /* Can create non-temporal VMs. */ +#define PRIV_VMM_DESTROY 712 /* Can destroy other users' VMs. */ /* * Track end of privilege list. */ -#define _PRIV_HIGHEST 711 +#define _PRIV_HIGHEST 713 /* * Validate that a named privilege is known by the privilege system. Invalid * privileges presented to the privilege system by a priv_check interface * will result in a panic. This is only approximate due to sparse allocation * of the privilege space. */ #define PRIV_VALID(x) ((x) > _PRIV_LOWEST && (x) < _PRIV_HIGHEST) #ifdef _KERNEL /* * Privilege check interfaces, modeled after historic suser() interfaces, but * with the addition of a specific privilege name. No flags are currently * defined for the API. Historically, flags specified using the real uid * instead of the effective uid, and whether or not the check should be * allowed in jail. */ struct thread; struct ucred; int priv_check(struct thread *td, int priv); int priv_check_cred(struct ucred *cred, int priv); int priv_check_cred_vfs_lookup(struct ucred *cred); int priv_check_cred_vfs_lookup_nomac(struct ucred *cred); int priv_check_cred_vfs_generation(struct ucred *cred); #endif #endif /* !_SYS_PRIV_H_ */